diff --git a/commons/Cargo.toml b/commons/Cargo.toml index 2d07fb75..f9f99efc 100644 --- a/commons/Cargo.toml +++ b/commons/Cargo.toml @@ -23,6 +23,7 @@ icu_segmenter = "1.3.0" icu_locid = "1.3.0" regex = { version = "1.8.1", optional = true } insta = { version = "1.29.0", features = ["serde"], optional = true } +itertools = "0.11.0" [features] test_runner = ["dep:regex", "dep:once_cell", "dep:insta"] diff --git a/commons/src/scanner/mod.rs b/commons/src/scanner/mod.rs index 5033e96b..515ee895 100644 --- a/commons/src/scanner/mod.rs +++ b/commons/src/scanner/mod.rs @@ -1,81 +1,74 @@ -//! Scanner and helper types and traits for structurization of Unimarkup input. +//! Functionality, iterators, helper types and traits to get [`Symbol`]s from `&str`. +//! These [`Symbol`]s and iterators are used to convert the input into a Unimarkup document. + +use icu_segmenter::GraphemeClusterSegmenter; pub mod position; pub mod span; mod symbol; -use icu_segmenter::GraphemeClusterSegmenter; -use position::{Offset, Position}; -pub use symbol::{Symbol, SymbolKind}; - -#[derive(Debug)] -pub struct Scanner { - segmenter: GraphemeClusterSegmenter, -} - -impl Clone for Scanner { - fn clone(&self) -> Self { - let segmenter = GraphemeClusterSegmenter::new(); - - Self { segmenter } - } -} - -impl Default for Scanner { - fn default() -> Self { - let segmenter = GraphemeClusterSegmenter::new(); +use position::{Offset, Position as SymPos}; +pub use symbol::{iterator::*, Symbol, SymbolKind}; - Self { segmenter } - } -} - -impl Scanner { - pub fn scan_str<'s>(&self, input: &'s str) -> Vec> { - let mut symbols: Vec = Vec::new(); - let mut curr_pos: Position = Position::default(); - let mut prev_offset = 0; +/// Scans given input and returns vector of [`Symbol`]s needed to convert the input to Unimarkup content. 
+pub fn scan_str(input: &str) -> Vec<Symbol<'_>> { + let segmenter = GraphemeClusterSegmenter::new(); - // skip(1) to ignore break at start of input - for offset in self.segmenter.segment_str(input).skip(1) { - if let Some(grapheme) = input.get(prev_offset..offset) { - let mut kind = SymbolKind::from(grapheme); + let mut symbols: Vec<Symbol> = Vec::new(); + let mut curr_pos: SymPos = SymPos::default(); + let mut prev_offset = 0; - let end_pos = if kind == SymbolKind::Newline { - Position { - line: (curr_pos.line + 1), - ..Default::default() - } - } else { - Position { - line: curr_pos.line, - col_utf8: (curr_pos.col_utf8 + grapheme.len()), - col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()), - col_grapheme: (curr_pos.col_grapheme + 1), - } - }; + // skip(1) to ignore break at start of input + for offset in segmenter.segment_str(input).skip(1) { + if let Some(grapheme) = input.get(prev_offset..offset) { + let mut kind = SymbolKind::from(grapheme); - if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline { - // newline at the start of line -> Blankline - kind = SymbolKind::Blankline; + let end_pos = if kind == SymbolKind::Newline { + SymPos { + line: (curr_pos.line + 1), + ..Default::default() } + } else { + SymPos { + line: curr_pos.line, + col_utf8: (curr_pos.col_utf8 + grapheme.len()), + col_utf16: (curr_pos.col_utf16 + grapheme.encode_utf16().count()), + col_grapheme: (curr_pos.col_grapheme + 1), + } + }; - symbols.push(Symbol { - input, - kind, - offset: Offset { - start: prev_offset, - end: offset, - }, - start: curr_pos, - end: end_pos, - }); - - curr_pos = end_pos; + if curr_pos.col_utf8 == 1 && kind == SymbolKind::Newline { + // newline at the start of line -> Blankline + kind = SymbolKind::Blankline; } - prev_offset = offset; - } - // last offset not needed, because break at EOI is always available - symbols + symbols.push(Symbol { + input, + kind, + offset: Offset { + start: prev_offset, + end: offset, + }, + start: curr_pos, + end: end_pos, + }); + + curr_pos = end_pos; + } + prev_offset = offset; } + + symbols.push(Symbol { + input, + kind: SymbolKind::EOI, + offset: Offset { + start: prev_offset, + end: prev_offset, + }, + start: curr_pos, + end: curr_pos, + }); + + // last offset not needed, because break at EOI is always available + symbols } diff --git a/commons/src/scanner/symbol/iterator/matcher.rs b/commons/src/scanner/symbol/iterator/matcher.rs new file mode 100644 index 00000000..be59190b --- /dev/null +++ b/commons/src/scanner/symbol/iterator/matcher.rs @@ -0,0 +1,150 @@ +//! Contains matcher traits and types used to detect iterator end and strip prefixes. +//! The available matcher traits are implemented for [`SymbolIterator`]. + +use std::rc::Rc; + +use itertools::{Itertools, PeekingNext}; + +use crate::scanner::SymbolKind; + +use super::SymbolIterator; + +/// Function type to notify an iterator if an end was reached. +pub type IteratorEndFn = Rc<dyn (Fn(&mut dyn EndMatcher) -> bool)>; + +/// Function type to consume prefix sequences of a new line. +pub type IteratorPrefixFn = Rc<dyn (Fn(&mut dyn PrefixMatcher) -> bool)>; + +/// Trait containing functions that are available inside the end matcher function. +pub trait EndMatcher { + /// Returns `true` if the upcoming [`Symbol`] sequence is an empty line. + /// Meaning that a line contains no [`Symbol`] or only [`SymbolKind::Whitespace`]. + /// + /// **Note:** This is also `true` if a parent iterator stripped non-whitespace symbols, and the nested iterator only has whitespace symbols.
+ /// + /// [`Symbol`]: super::Symbol + fn is_empty_line(&mut self) -> bool; + + /// Wrapper around [`Self::is_empty_line()`] that additionally consumes the matched empty line. + /// Consuming means the related iterator advances over the matched empty line. + /// + /// **Note:** The iterator is only advanced if an empty line is matched. + /// + /// **Note:** The empty line is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`]. + fn consumed_is_empty_line(&mut self) -> bool; + + /// Returns `true` if the given [`Symbol`] sequence matches the upcoming one. + /// + /// [`Symbol`]: super::Symbol + fn matches(&mut self, sequence: &[SymbolKind]) -> bool; + + /// Wrapper around [`Self::matches()`] that additionally consumes the matched sequence. + /// Consuming means the related iterator advances over the matched sequence. + /// + /// **Note:** The iterator is only advanced if the sequence is matched. + /// + /// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`]. + fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool; + + /// Returns `true` if the iterator is at the given nesting depth. + /// + /// **Note:** Use [`SymbolIterator::depth()`] to get the current depth of an iterator. + fn at_depth(&self, depth: usize) -> bool; +} + +/// Trait containing functions that are available inside the prefix matcher function. +pub trait PrefixMatcher { + /// Consumes and returns `true` if the given [`Symbol`] sequence matches the upcoming one. + /// Consuming means the related iterator advances over the matched sequence. + /// + /// **Note:** The iterator is only advanced if the sequence is matched. + /// + /// **Note:** The given sequence must **not** include any [`SymbolKind::Newline`], because matches are only considered per line. + /// + /// **Note:** The matched sequence is **not** included in the symbols returned by [`SymbolIterator::take_to_end()`].
+ /// + /// [`Symbol`]: super::Symbol + fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool; +} + +impl<'input> EndMatcher for SymbolIterator<'input> { + fn is_empty_line(&mut self) -> bool { + // Note: Multiple matches may be set in the match closure, so we need to ensure that all start at the same index + self.reset_peek(); + + let next = self + .peeking_next(|s| { + matches!( + s.kind, + SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI + ) + }) + .map(|s| s.kind); + + let is_empty_line = if Some(SymbolKind::Newline) == next { + let _whitespaces = self + .peeking_take_while(|s| s.kind == SymbolKind::Whitespace) + .count(); + + let new_line = self.peeking_next(|s| { + matches!( + s.kind, + SymbolKind::Newline | SymbolKind::Blankline | SymbolKind::EOI + ) + }); + new_line.is_some() + } else { + next.is_some() + }; + + is_empty_line + } + + fn consumed_is_empty_line(&mut self) -> bool { + let is_empty_line = self.is_empty_line(); + + if is_empty_line { + self.set_index(self.peek_index()); // To consume peeked symbols + } + + is_empty_line + } + + fn matches(&mut self, sequence: &[SymbolKind]) -> bool { + // Note: Multiple matches may be set in the match closure, so we need to ensure that all start at the same index + self.reset_peek(); + + for kind in sequence { + if self.peeking_next(|s| s.kind == *kind).is_none() { + return false; + } + } + + true + } + + fn consumed_matches(&mut self, sequence: &[SymbolKind]) -> bool { + let matched = self.matches(sequence); + + if matched { + self.set_index(self.peek_index()); // To consume peeked symbols + } + + matched + } + + fn at_depth(&self, depth: usize) -> bool { + self.depth() == depth + } +} + +impl<'input> PrefixMatcher for SymbolIterator<'input> { + fn consumed_prefix(&mut self, sequence: &[SymbolKind]) -> bool { + debug_assert!( + !sequence.contains(&SymbolKind::Newline), + "Newline symbol in prefix match is not allowed." + ); + + self.consumed_matches(sequence) + } +} diff --git a/commons/src/scanner/symbol/iterator/mod.rs b/commons/src/scanner/symbol/iterator/mod.rs new file mode 100644 index 00000000..a175caba --- /dev/null +++ b/commons/src/scanner/symbol/iterator/mod.rs @@ -0,0 +1,503 @@ +//! Contains the [`SymbolIterator`], and all related functionality +//! that is used to step through the [`Symbol`]s returned by [`scan_str`](crate::scanner::scan_str). + +use std::borrow::BorrowMut; + +use super::{Symbol, SymbolKind}; + +mod matcher; +mod root; + +pub use itertools::*; +pub use matcher::*; +pub use root::*; + +/// The [`SymbolIterator`] provides an iterator over [`Symbol`]s. +/// It allows adding matcher functions that notify the iterator +/// when the end of an element is reached, or which prefixes to strip on a new line. +/// Additionally, the iterator may be nested to enable transparent iterating for nested elements. +/// +/// *Transparent* means that the nested iterator does not see [`Symbol`]s consumed by the wrapped (parent) iterator. +/// In other words, wrapped iterators control which [`Symbol`]s will be passed to their nested iterator. +/// Therefore, each nested iterator only sees those [`Symbol`]s that are relevant to its scope. +#[derive(Clone)] +pub struct SymbolIterator<'input> { + /// The [`SymbolIteratorKind`] of this iterator. + kind: SymbolIteratorKind<'input>, + /// The index inside the [`Symbol`]s of the root iterator. + start_index: usize, + /// The nesting depth of this iterator, starting at 0 for the root iterator.
+ depth: usize, + /// Optional matching function that is used to automatically skip matched prefixes after a new line. + prefix_match: Option, + /// Optional matching function that is used to indicate the end of this iterator. + end_match: Option, + /// Flag set to `true` if this iterator reached its end. + iter_end: bool, +} + +/// The [`SymbolIteratorKind`] defines the kind of a [`SymbolIterator`]. +#[derive(Clone)] +pub enum SymbolIteratorKind<'input> { + /// Defines an iterator as being nested. + /// The contained iterator is the parent iterator. + Nested(Box>), + /// Defines an iterator as being root. + Root(SymbolIteratorRoot<'input>), +} + +impl<'input> SymbolIterator<'input> { + /// Creates a new [`SymbolIterator`] from the given [`Symbol`] slice. + /// This iterator is created without matching functions. + pub fn new(symbols: &'input [Symbol<'input>]) -> Self { + SymbolIterator::from(symbols) + } + + /// Creates a new [`SymbolIterator`] from the given [`Symbol`] slice, + /// and the given matching functions. + /// + /// # Arguments + /// + /// * `symbols` ... [`Symbol`] slice to iterate over + /// * `prefix_match` ... Optional matching function used to strip prefix on new lines + /// * `end_match` ... Optional matching function used to indicate the end of the created iterator + pub fn with( + symbols: &'input [Symbol<'input>], + prefix_match: Option, + end_match: Option, + ) -> Self { + SymbolIterator { + kind: SymbolIteratorKind::Root(SymbolIteratorRoot::from(symbols)), + depth: 0, + start_index: 0, + prefix_match, + end_match, + iter_end: false, + } + } + + /// Returns the maximum length of the remaining [`Symbol`]s this iterator might return. + /// + /// **Note:** This length does not consider parent iterators, or matching functions. + /// Therefore, the returned number of [`Symbol`]s might differ, but cannot be larger than this length. + pub fn max_len(&self) -> usize { + self.max_remaining_symbols().unwrap_or(&[]).len() + } + + /// Returns `true` if no more [`Symbol`]s are available. + pub fn is_empty(&self) -> bool { + self.max_remaining_symbols().unwrap_or(&[]).is_empty() + } + + /// Returns the index this iterator was started from the [`Symbol`] slice of the root iterator. + pub fn start_index(&self) -> usize { + self.start_index + } + + /// The current nested depth this iterator is at. + /// The root iterator starts at 0, and every iterator created using [`Self::nest()`] is one depth higher than its parent. + pub fn depth(&self) -> usize { + self.depth + } + + /// Returns the current index this iterator is in the [`Symbol`] slice of the root iterator. + pub fn index(&self) -> usize { + match &self.kind { + SymbolIteratorKind::Nested(parent) => parent.index(), + SymbolIteratorKind::Root(root) => root.index, + } + } + + /// Sets the current index of this iterator to the given index. + pub(super) fn set_index(&mut self, index: usize) { + if index >= self.start_index { + match self.kind.borrow_mut() { + SymbolIteratorKind::Nested(parent) => parent.set_index(index), + SymbolIteratorKind::Root(root) => { + root.index = index; + root.peek_index = index; + } + } + } + } + + /// Returns the index used to peek. + fn peek_index(&self) -> usize { + match &self.kind { + SymbolIteratorKind::Nested(parent) => parent.peek_index(), + SymbolIteratorKind::Root(root) => root.peek_index, + } + } + + /// Sets the peek index of this iterator to the given index. 
+ fn set_peek_index(&mut self, index: usize) { + if index >= self.index() { + match self.kind.borrow_mut() { + SymbolIteratorKind::Nested(parent) => parent.set_peek_index(index), + SymbolIteratorKind::Root(root) => { + root.peek_index = index; + } + } + } + } + + /// Resets peek to get `peek() == next()`. + /// + /// **Note:** Needed to reset peek index after using `peeking_next()`. + pub fn reset_peek(&mut self) { + self.set_peek_index(self.index()); + } + + /// Returns the maximal remaining symbols in this iterator. + /// + /// **Note:** This slice does not consider parent iterators, or matching functions. + /// Therefore, the returned [`Symbol`] slice might differ from the symbols returned by calling [`Self::next()`], + /// but [`Self::next()`] cannot return more symbols than those inside the returned slice. + pub fn max_remaining_symbols(&self) -> Option<&'input [Symbol<'input>]> { + match &self.kind { + SymbolIteratorKind::Nested(parent) => parent.max_remaining_symbols(), + SymbolIteratorKind::Root(root) => root.remaining_symbols(), + } + } + + /// Returns the next [`Symbol`] without changing the current index. + pub fn peek(&mut self) -> Option<&'input Symbol<'input>> { + let symbol = self.peeking_next(|_| true); + self.reset_peek(); // Note: Resetting index, because peek() must be idempotent + symbol + } + + /// Returns the [`SymbolKind`] of the peeked [`Symbol`]. + pub fn peek_kind(&mut self) -> Option { + self.peek().map(|s| s.kind) + } + + /// Nests this iterator, by creating a new iterator that has this iterator set as parent. + /// + /// **Note:** Any change in this iterator is **not** propagated to the nested iterator. + /// See [`Self::update()`] on how to synchronize this iterator with the nested one. + /// + /// # Arguments + /// + /// * `prefix_match` ... Optional matching function used to strip prefix on new lines + /// * `end_match` ... Optional matching function used to indicate the end of the created iterator + pub fn nest( + &self, + prefix_match: Option, + end_match: Option, + ) -> SymbolIterator<'input> { + SymbolIterator { + kind: SymbolIteratorKind::Nested(Box::new(self.clone())), + start_index: self.index(), + depth: self.depth + 1, + prefix_match, + end_match, + iter_end: self.iter_end, + } + } + + /// Updates the given parent iterator to take the progress of the nested iterator. + /// + /// **Note:** Only updates the parent if `self` is nested. + pub fn update(self, parent: &mut Self) { + if let SymbolIteratorKind::Nested(self_parent) = self.kind { + // Make sure it actually is the parent. + // It is not possible to check more precisely, because other indices are expected to be different due to `clone()`. + debug_assert_eq!( + self_parent.start_index, parent.start_index, + "Updated iterator is not the actual parent of this iterator." + ); + + *parent = *self_parent; + } + } + + /// Tries to skip symbols until one of the end functions signals the end. + /// + /// **Note:** This function might not reach the iterator end. + /// + /// If no symbols are left, or no given line prefix is matched, the iterator may stop before an end is reached. + /// Use [`Self::end_reached()`] to check if the end was actually reached. + pub fn skip_to_end(mut self) -> Self { + let _last_symbol = self.by_ref().last(); + + self + } + + /// Collects and returns all symbols until one of the end functions signals the end, + /// or until no line prefix is matched after a new line. 
+ pub fn take_to_end(&mut self) -> Vec<&'input Symbol<'input>> { + let mut symbols = Vec::new(); + + for symbol in self.by_ref() { + symbols.push(symbol); + } + + symbols + } + + /// Returns `true` if this iterator has reached its end. + pub fn end_reached(&self) -> bool { + self.iter_end + } +} + +impl<'input, T> From for SymbolIterator<'input> +where + T: Into<&'input [Symbol<'input>]>, +{ + fn from(value: T) -> Self { + SymbolIterator { + kind: SymbolIteratorKind::Root(SymbolIteratorRoot::from(value)), + start_index: 0, + depth: 0, + prefix_match: None, + end_match: None, + iter_end: false, + } + } +} + +impl<'input> Iterator for SymbolIterator<'input> { + type Item = &'input Symbol<'input>; + + fn next(&mut self) -> Option { + if self.end_reached() { + return None; + } + + if let Some(end_fn) = self.end_match.clone() { + if (end_fn)(self) { + self.iter_end = true; + return None; + } + } + + let curr_symbol_opt = match &mut self.kind { + SymbolIteratorKind::Nested(parent) => parent.next(), + SymbolIteratorKind::Root(root) => root.next(), + }; + + if curr_symbol_opt?.kind == SymbolKind::Newline && self.prefix_match.is_some() { + let prefix_match = self + .prefix_match + .clone() + .expect("Prefix match checked above to be some."); + + // Note: This mostly indicates a syntax violation, so skipped symbol is ok. + if !prefix_match(self) { + return None; + } + } + + curr_symbol_opt + } + + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.max_len())) + } +} + +impl<'input> PeekingNext for SymbolIterator<'input> { + fn peeking_next(&mut self, accept: F) -> Option + where + Self: Sized, + F: FnOnce(&Self::Item) -> bool, + { + // Note: Not possible to restrict peek to return only symbols `next()` would return, + // because `peeking_next()` is needed in End- and PrefixMatcher. + // Using the same logic as in `next()` would result in endless loop inside `peeking_next()` => StackOverflow + + match &mut self.kind { + SymbolIteratorKind::Nested(parent) => parent.peeking_next(accept), + SymbolIteratorKind::Root(root) => root.peeking_next(accept), + } + } +} + +#[cfg(test)] +mod test { + use std::rc::Rc; + + use itertools::{Itertools, PeekingNext}; + + use crate::scanner::{PrefixMatcher, SymbolKind}; + + use super::SymbolIterator; + + #[test] + fn peek_while_index() { + let symbols = crate::scanner::scan_str("## "); + + let mut iterator = SymbolIterator::from(&*symbols); + let hash_cnt = iterator + .peeking_take_while(|symbol| symbol.kind == SymbolKind::Hash) + .count(); + + let next_symbol = iterator.nth(hash_cnt); + let curr_index = iterator.index(); + + assert_eq!(hash_cnt, 2, "Hash symbols in input not correctly detected."); + assert_eq!(curr_index, 3, "Current index was not updated correctly."); + assert_eq!( + next_symbol.map(|s| s.kind), + Some(SymbolKind::Whitespace), + "Whitespace after hash symbols was not detected." + ); + assert!( + iterator.next().unwrap().kind == SymbolKind::EOI, + "Input end reached, but new symbol was returned." + ); + } + + #[test] + fn peek_next() { + let symbols = crate::scanner::scan_str("#*"); + + let mut iterator = SymbolIterator::from(&*symbols); + + let peeked_symbol = iterator.peeking_next(|_| true); + let next_symbol = iterator.next(); + let next_peeked_symbol = iterator.peeking_next(|_| true); + let curr_index = iterator.index(); + + assert_eq!(curr_index, 1, "Current index was not updated correctly."); + assert_eq!( + peeked_symbol.map(|s| s.kind), + Some(SymbolKind::Hash), + "peek_next() did not return hash symbol." 
+ ); + assert_eq!( + next_symbol.map(|s| s.kind), + Some(SymbolKind::Hash), + "next() did not return hash symbol." + ); + assert_eq!( + next_peeked_symbol.map(|s| s.kind), + Some(SymbolKind::Star), + "Star symbol not peeked next." + ); + assert_eq!( + iterator.next().map(|s| s.kind), + Some(SymbolKind::Star), + "Star symbol not returned." + ); + } + + #[test] + fn reach_end() { + let symbols = crate::scanner::scan_str("text*"); + + let mut iterator = SymbolIterator::from(&*symbols).nest( + None, + Some(Rc::new(|matcher| matcher.matches(&[SymbolKind::Star]))), + ); + + let taken_symkinds = iterator + .take_to_end() + .iter() + .map(|s| s.kind) + .collect::>(); + + assert!(iterator.end_reached(), "Iterator end was not reached."); + assert_eq!( + taken_symkinds, + vec![ + SymbolKind::Plain, + SymbolKind::Plain, + SymbolKind::Plain, + SymbolKind::Plain + ], + "Symbols till end was reached are incorrect." + ); + } + + #[test] + fn with_nested_and_parent_prefix() { + let symbols = crate::scanner::scan_str("a\n* *b"); + + let iterator = SymbolIterator::with( + &symbols, + Some(Rc::new(|matcher: &mut dyn PrefixMatcher| { + matcher.consumed_prefix(&[SymbolKind::Star, SymbolKind::Whitespace]) + })), + None, + ); + + let mut inner = iterator.nest( + Some(Rc::new(|matcher: &mut dyn PrefixMatcher| { + matcher.consumed_prefix(&[SymbolKind::Star]) + })), + None, + ); + + let sym_kinds = inner + .take_to_end() + .iter() + .map(|s| s.kind) + .collect::>(); + + assert_eq!( + sym_kinds, + vec![ + SymbolKind::Plain, + SymbolKind::Newline, + SymbolKind::Plain, + SymbolKind::EOI + ], + "Prefix symbols not correctly skipped" + ); + } + + #[test] + fn depth_matcher() { + let symbols = crate::scanner::scan_str("[o [i]]"); + + let mut iterator = SymbolIterator::with( + &symbols, + None, + Some(Rc::new(|matcher| { + if matcher.at_depth(0) { + matcher.consumed_matches(&[SymbolKind::CloseBracket]) + } else { + false + } + })), + ); + + iterator = iterator.dropping(1); // To skip first open bracket + let mut taken_outer = iterator + .by_ref() + // Note: This will skip the open bracket for both iterators, but this is ok for this test + .take_while(|s| s.kind != SymbolKind::OpenBracket) + .collect::>(); + + let mut inner_iter = iterator.nest( + None, + Some(Rc::new(|matcher| { + if matcher.at_depth(1) { + matcher.consumed_matches(&[SymbolKind::CloseBracket]) + } else { + false + } + })), + ); + + let taken_inner = inner_iter.take_to_end(); + inner_iter.update(&mut iterator); + + taken_outer.extend(iterator.take_to_end().iter()); + + assert!(iterator.end_reached(), "Iterator end was not reached."); + assert_eq!( + taken_inner.iter().map(|s| s.as_str()).collect::>(), + vec!["i"], + "Inner symbols are incorrect." + ); + assert_eq!( + taken_outer.iter().map(|s| s.as_str()).collect::>(), + vec!["o", " ",], + "Outer symbols are incorrect." + ); + } +} diff --git a/commons/src/scanner/symbol/iterator/root.rs b/commons/src/scanner/symbol/iterator/root.rs new file mode 100644 index 00000000..a2082fe6 --- /dev/null +++ b/commons/src/scanner/symbol/iterator/root.rs @@ -0,0 +1,67 @@ +//! Contains the [`SymbolIteratorRoot`] that is the root iterator in any [`SymbolIterator`](super::SymbolIterator). + +use itertools::PeekingNext; + +use crate::scanner::Symbol; + +/// The [`SymbolIteratorRoot`] is the root iterator in any [`SymbolIterator`](super::SymbolIterator). +/// It holds the actual [`Symbol`] slice. +#[derive(Clone)] +pub struct SymbolIteratorRoot<'input> { + /// The [`Symbol`] slice the iterator was created for. 
+ symbols: &'input [Symbol<'input>], + /// The current index of the iterator inside the [`Symbol`] slice. + pub(super) index: usize, + /// The peek index of the iterator inside the [`Symbol`] slice. + pub(super) peek_index: usize, +} + +impl<'input> SymbolIteratorRoot<'input> { + /// Returns the remaining symbols in this iterator, or `None` if there are no symbols left. + pub(super) fn remaining_symbols(&self) -> Option<&'input [Symbol<'input>]> { + self.symbols.get(self.index..) + } +} + +impl<'input, T> From for SymbolIteratorRoot<'input> +where + T: Into<&'input [Symbol<'input>]>, +{ + fn from(value: T) -> Self { + SymbolIteratorRoot { + symbols: value.into(), + index: 0, + peek_index: 0, + } + } +} + +impl<'input> Iterator for SymbolIteratorRoot<'input> { + type Item = &'input Symbol<'input>; + + fn next(&mut self) -> Option { + let symbol = self.symbols.get(self.index)?; + + self.index += 1; + self.peek_index = self.index; + + Some(symbol) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining_symbols().map(<[_]>::len).unwrap_or(0); + (len, Some(len)) + } +} + +impl<'input> PeekingNext for SymbolIteratorRoot<'input> { + fn peeking_next(&mut self, accept: F) -> Option + where + Self: Sized, + F: FnOnce(&Self::Item) -> bool, + { + let symbol = self.symbols.get(self.peek_index).filter(accept)?; + self.peek_index += 1; + Some(symbol) + } +} diff --git a/commons/src/scanner/symbol.rs b/commons/src/scanner/symbol/mod.rs similarity index 98% rename from commons/src/scanner/symbol.rs rename to commons/src/scanner/symbol/mod.rs index fdfe6d87..72ce8f30 100644 --- a/commons/src/scanner/symbol.rs +++ b/commons/src/scanner/symbol/mod.rs @@ -4,6 +4,8 @@ use core::fmt; use super::position::{Offset, Position}; +pub mod iterator; + /// Possible kinds of Symbol found in Unimarkup document. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum SymbolKind { @@ -62,6 +64,19 @@ impl Default for SymbolKind { } } +impl SymbolKind { + pub fn is_not_keyword(&self) -> bool { + matches!( + self, + SymbolKind::Newline + | SymbolKind::Whitespace + | SymbolKind::Plain + | SymbolKind::Blankline + | SymbolKind::EOI + ) + } +} + /// Symbol representation of literals found in Unimarkup document. #[derive(Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct Symbol<'a> { @@ -104,14 +119,7 @@ impl fmt::Debug for Symbol<'_> { impl Symbol<'_> { // TODO: extension trait in core? pub fn is_not_keyword(&self) -> bool { - matches!( - self.kind, - SymbolKind::Newline - | SymbolKind::Whitespace - | SymbolKind::Plain - | SymbolKind::Blankline - | SymbolKind::EOI - ) + self.kind.is_not_keyword() } /// Returns the original string representation of the symbol. diff --git a/commons/src/test_runner/mod.rs b/commons/src/test_runner/mod.rs index f9e988ff..b364afd2 100644 --- a/commons/src/test_runner/mod.rs +++ b/commons/src/test_runner/mod.rs @@ -3,8 +3,6 @@ use std::{ path::{Path, PathBuf}, }; -use crate::scanner::{Scanner, Symbol}; - pub mod as_snapshot; pub mod snap_test_runner; pub mod spec_test; @@ -12,12 +10,6 @@ pub mod test_file; use self::test_file::{TestCase, TestFile}; -/// Scans the string using the [`Scanner`] struct. -pub fn scan_str(input: &str) -> Vec { - let scanner = Scanner::default(); - scanner.scan_str(input) -} - /// Finds all files with the given extension in the given path (recursively). 
/// /// # Arguments diff --git a/commons/src/test_runner/snap_test_runner.rs b/commons/src/test_runner/snap_test_runner.rs index d0d946b6..b332133d 100644 --- a/commons/src/test_runner/snap_test_runner.rs +++ b/commons/src/test_runner/snap_test_runner.rs @@ -17,11 +17,9 @@ impl<'a> SnapTestRunner<'a> { pub fn with_fn(name: &str, input: &'a S, mut parser: PF) -> SnapTestRunner<'a, ()> where S: AsRef<[Symbol<'a>]>, - PF: for<'s> FnMut(&'s [Symbol<'s>]) -> (String, &'s [Symbol<'s>]), + PF: for<'s> FnMut(&'s [Symbol<'s>]) -> String, { - let (snapshot, rest) = parser(input.as_ref()); - - assert_eq!(rest.len(), 0, "Whole input should be parsed"); + let snapshot = parser(input.as_ref()); SnapTestRunner { info: None, diff --git a/core/tests/runner/mod.rs b/core/tests/runner/mod.rs index 00588fad..bde1a3ff 100644 --- a/core/tests/runner/mod.rs +++ b/core/tests/runner/mod.rs @@ -66,20 +66,16 @@ fn run_spec_test(case: test_runner::test_file::TestCase) { fn run_snap_test(case: test_runner::test_file::TestCase) { let mut cfg = unimarkup_commons::config::Config::default(); - let scanner = unimarkup_commons::scanner::Scanner::default(); - let symbols = scanner.scan_str(&case.test.input); + let symbols = unimarkup_commons::scanner::scan_str(&case.test.input); let mut snap_runner = SnapTestRunner::with_fn::<_, _>(&case.test.name, &symbols, |_input| { let um = unimarkup_core::parser::parse_unimarkup(&case.test.input, &mut cfg); - ( - um.blocks - .iter() - .map(|block| Snapshot(block).as_snapshot()) - .collect(), - &[], - ) + um.blocks + .iter() + .map(|block| Snapshot(block).as_snapshot()) + .collect() }) .with_info(format!( "Test '{}' from: {}", diff --git a/inline/src/parser/mod.rs b/inline/src/parser/mod.rs index d1b2fc11..aeb4b7be 100644 --- a/inline/src/parser/mod.rs +++ b/inline/src/parser/mod.rs @@ -127,7 +127,9 @@ impl<'input> Parser<'input> { span.end = next_token.span.end; break; } else if not_enclosed_and_interrupted { - if next_token.consumable_by_plain() { + if !matches!(kind, TokenKind::Newline | TokenKind::EscapedNewline) + && next_token.consumable_by_plain() + { // consume the token let (next_content, next_span) = next_token.parts(); diff --git a/inline/tests/lexer/mod.rs b/inline/tests/lexer/mod.rs index 834b6e46..03b61879 100644 --- a/inline/tests/lexer/mod.rs +++ b/inline/tests/lexer/mod.rs @@ -37,11 +37,11 @@ pub fn test_lexer_snapshots() -> Vec { } fn run_test_case(case: test_runner::test_file::TestCase) { - let symbols = test_runner::scan_str(&case.test.input); + let mut symbols = unimarkup_commons::scanner::scan_str(&case.test.input); + symbols.pop(); // Remove EOI symbol for test cases + let runner = SnapTestRunner::with_fn(&case.test.name, &symbols, |symbols| { - let rest = &[]; - let snapshot = Snapshot::snap((case.test.input.as_ref(), symbols.tokens())); - (snapshot, rest) + Snapshot::snap((case.test.input.as_ref(), symbols.tokens())) }) .with_info(format!( "Test '{}' from '{}'", diff --git a/inline/tests/parser/mod.rs b/inline/tests/parser/mod.rs index 14716d3f..d8c9c71e 100644 --- a/inline/tests/parser/mod.rs +++ b/inline/tests/parser/mod.rs @@ -36,13 +36,11 @@ pub fn test_parser_snapshots() -> Vec { } fn run_test_case(case: test_runner::test_file::TestCase) { - let symbols = test_runner::scan_str(&case.test.input); + let symbols = unimarkup_commons::scanner::scan_str(&case.test.input); let runner = SnapTestRunner::with_fn(&case.test.name, &symbols, |symbols| { - let rest: &[_] = &[]; let inlines: Vec<_> = symbols.parse_inlines().collect(); - let snapshot = 
Snapshot::snap(&inlines[..]); - (snapshot, rest) + Snapshot::snap(&inlines[..]) }) .with_info(format!( "Test '{}' from '{}'", diff --git a/parser/src/elements/atomic/heading.rs b/parser/src/elements/atomic/heading.rs index 5d68e864..2bbb209f 100644 --- a/parser/src/elements/atomic/heading.rs +++ b/parser/src/elements/atomic/heading.rs @@ -1,10 +1,14 @@ +use std::rc::Rc; + use strum_macros::*; use unimarkup_inline::{Inline, ParseInlines}; use crate::elements::blocks::Block; use crate::elements::Blocks; use crate::parser::{ElementParser, TokenizeOutput}; -use unimarkup_commons::scanner::{Symbol, SymbolKind}; +use unimarkup_commons::scanner::{ + EndMatcher, Itertools, PrefixMatcher, Symbol, SymbolIterator, SymbolKind, +}; use super::log_id::AtomicError; @@ -112,7 +116,7 @@ pub enum HeadingToken<'a> { Level(HeadingLevel), /// Content of the heading - Content(&'a [Symbol<'a>]), + Content(Vec<&'a Symbol<'a>>), /// Marks the end of the heading End, @@ -121,37 +125,54 @@ pub enum HeadingToken<'a> { impl ElementParser for Heading { type Token<'a> = self::HeadingToken<'a>; - fn tokenize<'i>(input: &'i [Symbol<'i>]) -> Option>> { - let mut level_depth = input - .iter() - .take_while(|symbol| matches!(symbol.kind, SymbolKind::Hash)) - .count(); + fn tokenize<'i>(input: &mut SymbolIterator<'i>) -> Option>> { + let mut heading_start: Vec = input + .peeking_take_while(|symbol| matches!(symbol.kind, SymbolKind::Hash)) + .map(|s| s.kind) + .collect(); + let level_depth = heading_start.len(); let level: HeadingLevel = HeadingLevel::try_from(level_depth).ok()?; - if input.get(level_depth)?.kind != SymbolKind::Whitespace { + if input.by_ref().nth(level_depth)?.kind != SymbolKind::Whitespace { return None; } - level_depth += 1; // +1 space offset - let content_symbols = input - .iter() - .skip(level_depth) - .take_while(|symbol| !matches!(symbol.kind, SymbolKind::Blankline | SymbolKind::EOI)) - .count(); + heading_start.push(SymbolKind::Whitespace); + + let sub_heading_start: Vec = std::iter::repeat(SymbolKind::Hash) + .take(heading_start.len()) + .chain([SymbolKind::Whitespace]) + .collect(); + let heading_end = move |matcher: &mut dyn EndMatcher| { + matcher.consumed_is_empty_line() + || matcher.matches(&[SymbolKind::EOI]) + || level != HeadingLevel::Level6 && matcher.matches(&sub_heading_start) + }; + + let whitespace_indents: Vec = std::iter::repeat(SymbolKind::Whitespace) + .take(heading_start.len()) + .collect(); + let heading_prefix = move |matcher: &mut dyn PrefixMatcher| { + matcher.consumed_prefix(&heading_start) || matcher.consumed_prefix(&whitespace_indents) + }; + + let mut content_iter = + input.nest(Some(Rc::new(heading_prefix)), Some(Rc::new(heading_end))); + let content_symbols = content_iter.take_to_end(); - let content_start = level_depth; - let content_end = content_start + content_symbols; + // Line prefixes violated => invalid heading syntax + if !content_iter.end_reached() { + return None; + } - let content = &input[content_start..content_end]; - let rest = &input[content_end..]; + content_iter.update(input); let output = TokenizeOutput { tokens: vec![ HeadingToken::Level(level), - HeadingToken::Content(content), + HeadingToken::Content(content_symbols), HeadingToken::End, ], - rest_of_input: rest, }; Some(output) @@ -161,12 +182,19 @@ impl ElementParser for Heading { let HeadingToken::Level(level) = input[0] else { return None; }; - let HeadingToken::Content(symbols) = input[1] else { + let HeadingToken::Content(ref symbols) = input[1] else { return None; }; let inline_start = 
symbols.get(0)?.start; - let content = symbols.parse_inlines().collect(); + // TODO: Adapt inline lexer to also work with Vec<&'input Symbol> + let content = symbols + .iter() + .map(|&s| *s) + .collect::>>() + .parse_inlines() + .collect(); + let line_nr = inline_start.line; let block = Self { id: String::default(), diff --git a/parser/src/elements/atomic/paragraph.rs b/parser/src/elements/atomic/paragraph.rs index 32983742..a09af705 100644 --- a/parser/src/elements/atomic/paragraph.rs +++ b/parser/src/elements/atomic/paragraph.rs @@ -7,7 +7,7 @@ use crate::{ elements::{blocks::Block, types}, parser::TokenizeOutput, }; -use unimarkup_commons::scanner::{Symbol, SymbolKind}; +use unimarkup_commons::scanner::{Symbol, SymbolIterator, SymbolKind}; /// Structure of a Unimarkup paragraph element. #[derive(Debug, Default, Clone, PartialEq, Eq)] @@ -28,9 +28,14 @@ pub struct Paragraph { impl Paragraph {} -impl From<&[Symbol<'_>]> for Paragraph { - fn from(value: &[Symbol<'_>]) -> Self { - let content = value.parse_inlines().collect(); +impl From>> for Paragraph { + fn from(value: Vec<&'_ Symbol<'_>>) -> Self { + let content = value + .iter() + .map(|&s| *s) + .collect::>>() + .parse_inlines() + .collect(); let line_nr = value.get(0).map(|symbol| symbol.start.line).unwrap_or(0); let id = crate::generate_id::generate_id(&format!( @@ -55,57 +60,19 @@ fn not_closing_symbol(symbol: &&Symbol) -> bool { .all(|closing| *closing != symbol.kind) } -enum TokenKind<'a> { - Start, - End, - Text(&'a [Symbol<'a>]), -} +impl ElementParser for Paragraph { + type Token<'a> = &'a Symbol<'a>; -pub(crate) struct ParagraphToken<'a> { - kind: TokenKind<'a>, -} + fn tokenize<'i>(input: &mut SymbolIterator<'i>) -> Option>> { + let content = input.by_ref().take_while(not_closing_symbol).collect(); -impl ElementParser for Paragraph { - type Token<'a> = self::ParagraphToken<'a>; - - fn tokenize<'input>( - input: &'input [Symbol<'input>], - ) -> Option>> { - let iter = input.iter(); - - let taken = iter.take_while(not_closing_symbol).count(); - let end_of_input = taken.min(input.len()); - - let tokens = vec![ - ParagraphToken { - kind: TokenKind::Start, - }, - ParagraphToken { - kind: TokenKind::Text(&input[..end_of_input]), - }, - ParagraphToken { - kind: TokenKind::End, - }, - ]; - - let input = &input[end_of_input..]; - - let output = TokenizeOutput { - tokens, - rest_of_input: input, - }; + let output = TokenizeOutput { tokens: content }; Some(output) } fn parse(input: Vec>) -> Option { - let content = match input[1].kind { - TokenKind::Start => &[], - TokenKind::End => &[], - TokenKind::Text(symbols) => symbols, - }; - - let block = Block::Paragraph(Paragraph::from(content)); + let block = Block::Paragraph(Paragraph::from(input)); Some(vec![block]) } diff --git a/parser/src/elements/enclosed/verbatim.rs b/parser/src/elements/enclosed/verbatim.rs index 3275e7f7..4b62ad80 100644 --- a/parser/src/elements/enclosed/verbatim.rs +++ b/parser/src/elements/enclosed/verbatim.rs @@ -1,9 +1,11 @@ +use std::rc::Rc; + use serde::{Deserialize, Serialize}; use crate::elements::blocks::Block; use crate::elements::Blocks; use crate::parser::{ElementParser, TokenizeOutput}; -use unimarkup_commons::scanner::{Symbol, SymbolKind}; +use unimarkup_commons::scanner::{EndMatcher, Itertools, Symbol, SymbolIterator, SymbolKind}; /// Structure of a Unimarkup verbatim block element. #[derive(Debug, PartialEq, Eq, Clone)] @@ -14,6 +16,9 @@ pub struct Verbatim { /// The content of the verbatim block. 
pub content: String, + /// The language used to highlight the content. + pub data_lang: Option, + /// Attributes of the verbatim block. // TODO: make attributes data structure pub attributes: Option, @@ -24,80 +29,97 @@ pub struct Verbatim { } pub(crate) enum Token<'a> { - Delimiter { line: usize }, - Content(&'a [Symbol<'a>]), + StartDelim(Vec<&'a Symbol<'a>>), + DataLang(Vec<&'a Symbol<'a>>), + Content(Vec<&'a Symbol<'a>>), } impl ElementParser for Verbatim { type Token<'a> = self::Token<'a>; - fn tokenize<'i>(input: &'i [Symbol<'i>]) -> Option>> { - let start_delim = input - .iter() - .take_while(|symbol| matches!(symbol.kind, SymbolKind::Tick)) + fn tokenize<'i>(input: &mut SymbolIterator<'i>) -> Option>> { + let start_delim_len = input + .by_ref() + .peeking_take_while(|symbol| matches!(symbol.kind, SymbolKind::Tick)) .count(); - if start_delim < 3 { + if start_delim_len < 3 { return None; }; - // we know there are at least 3 - let first_delim = input[0]; + let start_delim = input.by_ref().take(start_delim_len).collect(); + // Note: Consuming `Newline` is intended, because it is not part of the content, but also not of data-lang + let data_lang = input + .take_while(|s| s.kind != SymbolKind::Newline) + .collect::>(); + + let end_sequence = std::iter::once(SymbolKind::Newline) + .chain(std::iter::repeat(SymbolKind::Tick).take(start_delim_len)) + .collect::>(); + let mut longer_delim_sequence = end_sequence.clone(); + longer_delim_sequence.push(SymbolKind::Tick); + + let end_fn = Rc::new(move |matcher: &mut dyn EndMatcher| { + if !matcher.matches(&longer_delim_sequence) { + matcher.consumed_matches(&end_sequence) + } else { + false + } + }); + + let mut content_iter = input.nest(None, Some(end_fn)); + let content = content_iter.take_to_end(); + + if !content_iter.end_reached() { + return None; + } + + content_iter.update(input); // TODO: handle language attribute - let content_count = input - .iter() - .skip(start_delim) - .take_while(|symbol| !matches!(symbol.kind, SymbolKind::Tick)) - .count(); - - let end_delim = input - .iter() - .skip(start_delim + content_count) - .take_while(|sym| matches!(sym.kind, SymbolKind::Tick)) - .count(); - if end_delim != start_delim { + // ensures empty line after block + if !input.consumed_is_empty_line() { return None; } - let start_content = start_delim; - let end_content = start_content + content_count; - let content = &input[start_content..end_content]; - let rest = &input[end_content + end_delim..]; - - let last_delim = input[end_content]; - let output = TokenizeOutput { tokens: vec![ - Token::Delimiter { - line: first_delim.start.line, - }, + Token::StartDelim(start_delim), + Token::DataLang(data_lang), Token::Content(content), - Token::Delimiter { - line: last_delim.start.line, - }, ], - rest_of_input: rest, }; Some(output) } fn parse(input: Vec>) -> Option { - let Token::Delimiter { line } = input.get(0)? else { + let Token::StartDelim(start) = input.get(0)? else { return None; }; - let Token::Content(symbols) = input.get(1)? else { + let line_nr = start.get(0)?.start.line; + + let Token::DataLang(lang_symbols) = input.get(1)? else { + return None; + }; + let data_lang = if lang_symbols.is_empty() { + None + } else { + Some(Symbol::flatten_iter(lang_symbols.iter().copied())?.to_string()) + }; + + let Token::Content(symbols) = input.get(2)? 
else { return None; }; - let content = Symbol::flatten(symbols)?; + let content = Symbol::flatten_iter(symbols.iter().copied())?; let block = Self { id: String::default(), content: String::from(content), + data_lang, attributes: None, - line_nr: *line, + line_nr, }; Some(vec![Block::Verbatim(block)]) diff --git a/parser/src/parser.rs b/parser/src/parser.rs index 9000d20d..c3e273f1 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -1,7 +1,7 @@ //! Module for parsing of Unimarkup elements. use logid::log; -use unimarkup_commons::scanner::{Scanner, Symbol, SymbolKind}; +use unimarkup_commons::scanner::{SymbolIterator, SymbolKind}; use crate::{ document::Document, @@ -17,15 +17,11 @@ use crate::{ use unimarkup_commons::config::Config; /// Parser as function that can parse Unimarkup content -pub type ParserFn = for<'i> fn(&'i [Symbol<'i>]) -> Option<(Blocks, &'i [Symbol<'i>])>; +pub type ParserFn = for<'i> fn(&mut SymbolIterator<'i>) -> Option; /// Output of symbol tokenization by a parser of a block. -pub(crate) struct TokenizeOutput<'a, T> -where - T: 'a, -{ +pub(crate) struct TokenizeOutput { pub(crate) tokens: Vec, - pub(crate) rest_of_input: &'a [Symbol<'a>], } /// Trait implemented by a parser for each Unimarkup element. @@ -34,7 +30,7 @@ pub(crate) trait ElementParser { type Token<'a>; /// Function that converts input symbols into tokens specific for the given element. - fn tokenize<'i>(input: &'i [Symbol<'i>]) -> Option>>; + fn tokenize<'i>(input: &mut SymbolIterator<'i>) -> Option>>; /// Function that parses tokenization output and produces one or more Unimarkup elements. fn parse(input: Vec>) -> Option; @@ -65,7 +61,7 @@ where let tokenize_output = T::tokenize(input)?; let blocks = T::parse(tokenize_output.tokens)?; - Some((blocks, tokenize_output.rest_of_input)) + Some(blocks) } } } @@ -103,53 +99,56 @@ impl MainParser { } /// Parses Unimarkup content and produces Unimarkup blocks. 
- pub fn parse<'s>(&self, input: impl AsRef<[Symbol<'s>]>) -> Blocks { - let mut input = input.as_ref(); + pub fn parse(&self, input: &mut SymbolIterator) -> Blocks { let mut blocks = Vec::default(); #[cfg(debug_assertions)] - let mut input_len = input.len(); + let mut curr_len = input.max_len(); - 'outer: while let Some(sym) = input.first() { - match sym.kind { - // skip blanklines - SymbolKind::Blankline => input = &input[1..], + 'outer: while let Some(kind) = input.peek_kind() { + match kind { + // skip newlines between elements + SymbolKind::Blankline | SymbolKind::Newline => { + input.next(); + } // stop parsing when end of input is reached SymbolKind::EOI => break, // no parser will match, parse with default parser - _ if sym.is_not_keyword() => { - let (mut res_blocks, rest_of_input) = (self.default_parser)(input) + _ if kind.is_not_keyword() => { + let mut res_blocks = (self.default_parser)(input) .expect("Default parser could not parse content!"); blocks.append(&mut res_blocks); - input = rest_of_input; } // symbol is start of a block, some parser should match _ => { for parser_fn in &self.parsers { - if let Some((mut res_blocks, rest_of_input)) = parser_fn(input) { + let mut iter = input.clone(); + if let Some(mut res_blocks) = parser_fn(&mut iter) { blocks.append(&mut res_blocks); - input = rest_of_input; + *input = iter; continue 'outer; // start from first parser on next input } } // no registered parser matched -> use default parser - let (mut res_blocks, rest_of_input) = (self.default_parser)(input) + let mut res_blocks = (self.default_parser)(input) .expect("Default parser could not parse content!"); blocks.append(&mut res_blocks); - input = rest_of_input; } } #[cfg(debug_assertions)] { - assert_ne!(input.len(), input_len); - input_len = input.len(); + assert!( + input.max_len() < curr_len, + "Parser consumed no symbol in iteration." 
+ ); + curr_len = input.max_len(); } } @@ -161,9 +160,9 @@ impl MainParser { pub fn parse_unimarkup(um_content: &str, config: &mut Config) -> Document { let parser = MainParser::default(); - let symbols = Scanner::default().scan_str(um_content); - - let blocks = parser.parse(symbols); + let symbols = unimarkup_commons::scanner::scan_str(um_content); + let mut symbols_iter = SymbolIterator::from(&*symbols); + let blocks = parser.parse(&mut symbols_iter); let mut unimarkup = Document { config: config.clone(), diff --git a/render/src/html/render.rs b/render/src/html/render.rs index 370b5e32..291b0731 100644 --- a/render/src/html/render.rs +++ b/render/src/html/render.rs @@ -41,18 +41,6 @@ impl Renderer for HtmlRenderer { verbatim: &unimarkup_parser::elements::enclosed::Verbatim, _context: &Context, ) -> Result { - // TODO: improve handling of attributes - // let attributes = serde_json::from_str::( - // &verbatim.attributes.as_ref().cloned().unwrap_or_default(), - // ) - // .ok(); - - // let language = match attributes.as_ref() { - // Some(attrs) => attrs.language.clone().unwrap_or(PLAIN_SYNTAX.to_string()), - // None => PLAIN_SYNTAX.to_string(), - // }; - let language = "rust"; - let inner = Html::with( HtmlHead { syntax_highlighting_used: true, @@ -62,8 +50,14 @@ impl Renderer for HtmlRenderer { tag: HtmlTag::Code, attributes: HtmlAttributes::default(), content: Some( - highlight::highlight_content(&verbatim.content, language) - .unwrap_or(verbatim.content.clone()), + highlight::highlight_content( + &verbatim.content, + verbatim + .data_lang + .as_ref() + .unwrap_or(&highlight::PLAIN_SYNTAX.to_string()), + ) + .unwrap_or(verbatim.content.clone()), ), }), ); @@ -211,4 +205,49 @@ impl Renderer for HtmlRenderer { Ok(html) } + + fn render_newline( + &mut self, + _newline: &Newline, + _context: &Context, + ) -> Result { + let html = Html::with_body(HtmlBody::from(HtmlElement { + tag: HtmlTag::PlainContent, + attributes: HtmlAttributes::default(), + content: Some(unimarkup_inline::TokenKind::Whitespace.as_str().to_string()), + })); + + Ok(html) + } + + fn render_escaped_newline( + &mut self, + _escaped_newline: &EscapedNewline, + _context: &Context, + ) -> Result { + let html = Html::with_body(HtmlBody::from(HtmlElement { + tag: HtmlTag::Br, + attributes: HtmlAttributes::default(), + content: None, + })); + + Ok(html) + } + + fn render_escaped_whitespace( + &mut self, + escaped_whitespace: &EscapedWhitespace, + _context: &Context, + ) -> Result { + let html = Html::with_body(HtmlBody::from(HtmlElement { + tag: HtmlTag::Span, + attributes: HtmlAttributes(vec![HtmlAttribute { + name: "style".to_string(), + value: Some("white-space: pre-wrap;".to_string()), + }]), + content: Some(escaped_whitespace.inner().to_string()), + })); + + Ok(html) + } } diff --git a/render/src/html/tag.rs b/render/src/html/tag.rs index e8c02a32..1bcf029d 100644 --- a/render/src/html/tag.rs +++ b/render/src/html/tag.rs @@ -26,6 +26,7 @@ pub enum HtmlTag { Sup, Mark, Q, + Br, } impl HtmlTag { @@ -51,6 +52,7 @@ impl HtmlTag { HtmlTag::Sup => "sup", HtmlTag::Mark => "mark", HtmlTag::Q => "q", + HtmlTag::Br => "br", } } } diff --git a/render/src/render.rs b/render/src/render.rs index f6e23333..1d4c8b29 100644 --- a/render/src/render.rs +++ b/render/src/render.rs @@ -158,6 +158,29 @@ pub trait Renderer { Err(RenderError::Unimplemented) } + /// Render [`Newline` content](unimarkup_inline::inlines::Inline) to the output format `T`. 
+ fn render_newline(&mut self, _newline: &Newline, _context: &Context) -> Result { + Err(RenderError::Unimplemented) + } + + /// Render [`EscapedNewline` content](unimarkup_inline::inlines::Inline) to the output format `T`. + fn render_escaped_newline( + &mut self, + _escaped_newline: &EscapedNewline, + _context: &Context, + ) -> Result { + Err(RenderError::Unimplemented) + } + + /// Render [`EscapedWhitespace` content](unimarkup_inline::inlines::Inline) to the output format `T`. + fn render_escaped_whitespace( + &mut self, + _escaped_whitespace: &EscapedWhitespace, + _context: &Context, + ) -> Result { + Err(RenderError::Unimplemented) + } + //----------------------------- GENERIC ELEMENTS ----------------------------- /// Render Unimarkup [`Block`s](Block) to the output format `T`. @@ -236,6 +259,13 @@ pub trait Renderer { Inline::Quote(quote) => self.render_quote(quote, context), Inline::Verbatim(verbatim) => self.render_inline_verbatim(verbatim, context), Inline::Plain(plain) => self.render_plain(plain, context), + Inline::Newline(newline) => self.render_newline(newline, context), + Inline::EscapedNewline(escaped_newline) => { + self.render_escaped_newline(escaped_newline, context) + } + Inline::EscapedWhitespace(escaped_whitespace) => { + self.render_escaped_whitespace(escaped_whitespace, context) + } _ => Err(RenderError::Unimplemented), } }
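Usage sketch (not part of the change): a minimal example of how the new scanner API introduced in this diff is intended to be driven, with `scan_str` replacing the removed `Scanner` struct and a nested `SymbolIterator` ending via an `EndMatcher`. It only uses items added above (`scan_str`, `SymbolIterator::from`, `nest`, `take_to_end`, `update`); the input literal and the tick-delimited element are illustrative assumptions, not code from this PR.

use std::rc::Rc;

use unimarkup_commons::scanner::{scan_str, EndMatcher, SymbolIterator, SymbolKind};

fn main() {
    // `scan_str` now also appends a trailing `SymbolKind::EOI` symbol.
    let symbols = scan_str("`inline`");
    let mut root = SymbolIterator::from(&*symbols);

    // Skip the opening tick.
    root.next();

    // Nest an iterator that ends once the closing tick has been consumed.
    let mut inner = root.nest(
        None,
        Some(Rc::new(|matcher: &mut dyn EndMatcher| {
            matcher.consumed_matches(&[SymbolKind::Tick])
        })),
    );

    let content: String = inner.take_to_end().iter().map(|s| s.as_str()).collect();
    assert!(inner.end_reached(), "Closing tick was not consumed.");
    assert_eq!(content, "inline");

    // Hand the nested iterator's progress back to its parent.
    inner.update(&mut root);
    assert_eq!(root.peek_kind(), Some(SymbolKind::EOI));
}

This mirrors the pattern the reworked heading and verbatim parsers use: block parsers advance a shared `SymbolIterator` via `nest()`/`update()` instead of slicing `&[Symbol]` and returning `rest_of_input`.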