unimarkup · mhatzl · Nov 19, 2023 · Oct 24, 2023 · Oct 24, 2023 · Oct 24, 2023
diff --git a/Cargo.toml b/Cargo.toml
@@ -35,4 +35,3 @@ clap = { version = "4.2.7", features = ["derive", "cargo", "env"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 serde_yaml = "0.8.23"
-ribbon = "0.7.0"
diff --git a/commons/Cargo.toml b/commons/Cargo.toml
@@ -19,6 +19,7 @@ serde.workspace = true
 serde_json.workspace = true
 serde_yaml.workspace = true
 once_cell = { workspace = true, optional = true }
+icu_properties = "1.3.2"
 icu_segmenter = "1.3.0"
 icu_locid = "1.3.0"
 regex = { version = "1.8.1", optional = true }

diff --git a/commons/README.md b/commons/README.md
@@ -3,7 +3,7 @@
 This crate provides common functionalities needed in other Unimarkup crates.
 
 - `config` ... Contains the config struct defining what arguments are available for compilation
-- `scanner` ... Contains the `SymbolIterator` used to transform string input into Unimarkup symbols
+- `lexer` ... Contains the `TokenIterator` used to iterate over tokenized input containing Unimarkup elements
 - `test_runner` ... Contains convenience traits and macros to create automated snapshot tests
 
 # License

diff --git a/commons/src/scanner/mod.rs → commons/src/lexer/mod.rs b/commons/src/scanner/mod.rs → commons/src/lexer/mod.rs
@@ -1,16 +1,18 @@
-//! Functionality, iterators, helper types and traits to get [`Symbol`]s from `&str`.
-//! These [`Symbol`]s and iterators are used to convert the input into a Unimarkup document.
+//! Functionality, iterators, helper types and traits to get [`Token`](token::Token)s from `&str`.
+//! These [`Token`](token::Token)s and iterators are used to convert the input into a Unimarkup document.
 
 use icu_segmenter::GraphemeClusterSegmenter;
 
 pub mod position;
 pub mod span;
-mod symbol;
+pub mod symbol;
+pub mod token;
 
 use position::{Offset, Position as SymPos};
-pub use symbol::{iterator::*, Symbol, SymbolKind};
 
-/// Scans given input and returns vector of [`Symbol`]s needed to convert the input to Unimarkup content.
+use self::symbol::{Symbol, SymbolKind};
+
+/// Scans given input and returns a vector of [`Symbol`]s needed to convert the input to [`Token`](token::Token)s.
 pub fn scan_str(input: &str) -> Vec<Symbol<'_>> {
     let segmenter = GraphemeClusterSegmenter::new();
 
@@ -55,7 +57,7 @@ pub fn scan_str(input: &str) -> Vec<Symbol<'_>> {
 
     symbols.push(Symbol {
         input,
-        kind: SymbolKind::EOI,
+        kind: SymbolKind::Eoi,
         offset: Offset {
             start: prev_offset,
             end: prev_offset,

diff --git a/commons/src/scanner/position.rs → commons/src/lexer/position.rs b/commons/src/scanner/position.rs → commons/src/lexer/position.rs
@@ -5,31 +5,42 @@ use std::ops::{Add, AddAssign, Sub, SubAssign};
 
 use super::span::SpanLen;
 
-/// Indicates position of a symbol in a Unimarkup document. Both line and column
+/// Indicates position of a symbol or token in a Unimarkup document. Both line and column
 /// counting starts from 1.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Position {
-    /// Line the symbol is found at
+    /// Line the symbol or token is found at
     pub line: usize,
-    /// Column at which the symbol is located in line, when encoded as UTF8
+    /// Column at which the symbol or token is located in line, when encoded as UTF8
     pub col_utf8: usize,
-    /// Column at which the symbol is located in line, when encoded as UTF16
+    /// Column at which the symbol or token is located in line, when encoded as UTF16
     pub col_utf16: usize,
-    /// Column at which the symbol is located in line, when counting graphemes
+    /// Column at which the symbol or token is located in line, when counting graphemes
     pub col_grapheme: usize,
 }
 
-/// Symbol offset in the original input.
-#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
-pub(crate) struct Offset {
-    /// Start offset of a symbol, inclusive. This is the same as the end offset
-    /// of the previous symbol.
+/// Symbol or token offset in the original input.
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Offset {
+    /// Start offset of a symbol or token, inclusive. This is the same as the end offset
+    /// of the previous symbol or token.
     pub start: usize,
-    /// End offset of a symbol, exclusive. This is the same as the start offset
-    /// of the next symbol.
+    /// End offset of a symbol or token, exclusive. This is the same as the start offset
+    /// of the next symbol or token.
     pub end: usize,
 }
 
+impl Offset {
+    /// Extends this offset so that it also covers `other`.
+    ///
+    /// Only `end` is updated: it becomes the larger of `self.end` and `other.end`.
+    /// `start` is left untouched, which is why `other` must not start before `self`.
+    /// That precondition is only checked by a `debug_assert!`.
+    pub fn extend(&mut self, other: Offset) {
+        debug_assert!(
+            self.start <= other.start,
+            "Tried to extend self by another offset that started earlier."
+        );
+
+        // `max` keeps the current end if `other` ends before `self` does.
+        self.end = self.end.max(other.end)
+    }
+}
+
 impl Position {
     pub fn new(line: usize, column: usize) -> Self {
         Self {

diff --git a/commons/src/scanner/span.rs → commons/src/lexer/span.rs b/commons/src/scanner/span.rs → commons/src/lexer/span.rs
diff --git a/commons/src/lexer/symbol/iterator.rs b/commons/src/lexer/symbol/iterator.rs
@@ -0,0 +1,109 @@
+use itertools::PeekingNext;
+
+use crate::lexer::{Symbol, SymbolKind};
+
+/// Iterator over a slice of [`Symbol`]s that keeps a separate peek index next to its current index.
+///
+/// The iterator borrows the slice (`'slice`), and the [`Symbol`]s in turn borrow the original input (`'input`).
+#[derive(Debug, Clone)]
+pub struct SymbolIterator<'slice, 'input> {
+    /// The [`Symbol`] slice the iterator was created for.
+    symbols: &'slice [Symbol<'input>],
+    /// The current index of the iterator inside the [`Symbol`] slice.
+    pub(super) index: usize,
+    /// The peek index of the iterator inside the [`Symbol`] slice.
+    pub(super) peek_index: usize,
+}
+
+/// Creates a [`SymbolIterator`] from anything that converts into a [`Symbol`] slice.
+///
+/// Both the current index and the peek index start at `0`.
+impl<'slice, 'input, T> From<T> for SymbolIterator<'slice, 'input>
+where
+    T: Into<&'slice [Symbol<'input>]>,
+{
+    fn from(value: T) -> Self {
+        SymbolIterator {
+            symbols: value.into(),
+            index: 0,
+            peek_index: 0,
+        }
+    }
+}
+
+impl<'slice, 'input> Iterator for SymbolIterator<'slice, 'input> {
+    type Item = &'slice Symbol<'input>;
+
+    /// Returns the [`Symbol`] at the current index and advances the iterator by one.
+    ///
+    /// Advancing also discards any peek progress, because the peek index is set to the new current index.
+    /// Returns `None` without changing any index if no [`Symbol`] is left.
+    fn next(&mut self) -> Option<Self::Item> {
+        let symbol = self.symbols.get(self.index)?;
+
+        self.index += 1;
+        self.peek_index = self.index;
+
+        Some(symbol)
+    }
+
+    /// Reports `0` as lower bound and [`SymbolIterator::max_len()`] as upper bound.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.max_len()))
+    }
+}
+
+impl<'slice, 'input> PeekingNext for SymbolIterator<'slice, 'input> {
+    /// Returns the [`Symbol`] at the peek index if `accept` returns `true` for it.
+    ///
+    /// On success only the peek index is advanced; the current index stays where it is.
+    /// If `accept` rejects the [`Symbol`], or no [`Symbol`] is left, `None` is returned and no index changes.
+    fn peeking_next<F>(&mut self, accept: F) -> Option<Self::Item>
+    where
+        Self: Sized,
+        F: FnOnce(&Self::Item) -> bool,
+    {
+        let symbol = self.symbols.get(self.peek_index).filter(accept)?;
+        self.peek_index += 1;
+        Some(symbol)
+    }
+}
+
+impl<'slice, 'input> SymbolIterator<'slice, 'input> {
+    /// Returns the maximum length of the remaining [`Symbol`]s this iterator might return.
+    ///
+    /// **Note:** This length does not consider parent iterators, or matching functions.
+    /// Therefore, the returned number of [`Symbol`]s might differ, but cannot be larger than this length.
+    pub fn max_len(&self) -> usize {
+        // `saturating_sub` yields 0 instead of underflowing if the index is past the end of the slice.
+        self.symbols.len().saturating_sub(self.index)
+    }
+
+    /// Returns `true` if no more [`Symbol`]s are available.
+    pub fn is_empty(&self) -> bool {
+        self.max_len() == 0
+    }
+
+    /// Returns the current index this iterator is in the [`Symbol`] slice of the root iterator.
+    pub fn index(&self) -> usize {
+        self.index
+    }
+
+    /// Sets the current index of this iterator to the given index.
+    ///
+    /// The peek index is set to the same value.
+    /// Moving backward is only caught by a `debug_assert!`; there is no check against the slice length.
+    pub(crate) fn set_index(&mut self, index: usize) {
+        debug_assert!(self.index <= index, "Tried to move the iterator backward.");
+
+        self.index = index;
+        self.peek_index = index;
+    }
+
+    /// Returns the index used to peek.
+    pub(crate) fn peek_index(&self) -> usize {
+        self.peek_index
+    }
+
+    /// Sets the peek index of this iterator to the given index.
+    ///
+    /// The request is silently ignored if the given index is smaller than the current index.
+    pub(crate) fn set_peek_index(&mut self, index: usize) {
+        if self.index() <= index {
+            self.peek_index = index;
+        }
+    }
+
+    /// Resets the peek index to the current index, discarding any peek progress.
+    pub fn reset_peek(&mut self) {
+        self.set_peek_index(self.index());
+    }
+
+    /// Returns the [`Symbol`] at the peek index without changing the current index or the peek index.
+    pub fn peek(&mut self) -> Option<&'slice Symbol<'input>> {
+        self.symbols.get(self.peek_index)
+    }
+
+    /// Returns the [`SymbolKind`] of the peeked [`Symbol`].
+    pub fn peek_kind(&mut self) -> Option<SymbolKind> {
+        self.peek().map(|s| s.kind)
+    }
+}
diff --git a/commons/src/scanner/symbol/mod.rs → commons/src/lexer/symbol/mod.rs b/commons/src/scanner/symbol/mod.rs → commons/src/lexer/symbol/mod.rs
@@ -2,25 +2,33 @@
 
 use core::fmt;
 
+use icu_properties::sets::CodePointSetDataBorrowed;
+
 use super::position::{Offset, Position};
 
 pub mod iterator;
 
+pub const TERMINAL_PUNCTUATION: CodePointSetDataBorrowed<'static> =
+    icu_properties::sets::terminal_punctuation();
+
 /// Possible kinds of Symbol found in Unimarkup document.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub enum SymbolKind {
-    /// Hash symbol (#) used for headings
-    Hash,
     /// Regular text with no semantic meaning
+    #[default]
     Plain,
+    /// Unicode terminal punctuation
+    TerminalPunctuation,
     /// Any non-linebreaking whitespace
     Whitespace,
     /// A line break literal (for example `\n` or `\r\n`)
     Newline,
     /// End of Unimarkup document
-    EOI,
+    Eoi,
     /// The backslash (`\`) is used for escaping other symbols.
     Backslash,
+    /// Hash symbol (#) used for headings
+    Hash,
     /// The star (`*`) literal is used for various elements.
     Star,
     /// The minus (`-`) literal is used for various elements.
@@ -43,6 +51,10 @@ pub enum SymbolKind {
     Quote,
     /// The dollar (`$`) literal is used for math mode formatting.
     Dollar,
+    /// A colon literal (`:`) is used as marker (e.g. for alias substitutions `::heart::`).
+    Colon,
+    /// A dot literal (`.`).
+    Dot,
     /// The open parenthesis (`(`) literal is used for additional data to text group elements (e.g.
     /// image insert).
     OpenParenthesis,
@@ -56,21 +68,42 @@ pub enum SymbolKind {
     OpenBrace,
     /// The close brace (`}`) literal is used for inline attributes.
     CloseBrace,
-    /// A colon literal (`:`) is used as marker (e.g. for alias substitutions `::heart::`).
-    Colon,
-}
-
-impl Default for SymbolKind {
-    fn default() -> Self {
-        Self::Plain
-    }
 }
 
 impl SymbolKind {
+    /// Returns `true` if this kind is `Newline`, `Whitespace`, `Plain`, or `Eoi`.
+    ///
+    /// NOTE(review): `TerminalPunctuation` is not matched here, so [`SymbolKind::is_keyword()`]
+    /// returns `true` for it. Confirm that this is intended.
     pub fn is_not_keyword(&self) -> bool {
         matches!(
             self,
-            SymbolKind::Newline | SymbolKind::Whitespace | SymbolKind::Plain | SymbolKind::EOI
+            SymbolKind::Newline | SymbolKind::Whitespace | SymbolKind::Plain | SymbolKind::Eoi
+        )
+    }
+
+    /// Returns `true` for every kind for which [`SymbolKind::is_not_keyword()`] returns `false`.
+    pub fn is_keyword(&self) -> bool {
+        !self.is_not_keyword()
+    }
+
+    /// Returns `true` if this kind is an opening parenthesis (`(`), bracket (`[`), or brace (`{`).
+    pub fn is_open_parenthesis(&self) -> bool {
+        matches!(
+            self,
+            SymbolKind::OpenParenthesis | SymbolKind::OpenBracket | SymbolKind::OpenBrace
+        )
+    }
+
+    /// Returns `true` if this kind is a closing parenthesis (`)`), bracket (`]`), or brace (`}`).
+    pub fn is_close_parenthesis(&self) -> bool {
+        matches!(
+            self,
+            SymbolKind::CloseParenthesis | SymbolKind::CloseBracket | SymbolKind::CloseBrace
+        )
+    }
+
+    /// Returns `true` if this kind is any opening or closing parenthesis, bracket, or brace.
+    pub fn is_parenthesis(&self) -> bool {
+        self.is_open_parenthesis() || self.is_close_parenthesis()
+    }
+
+    /// Returns `true` if this kind is `Newline`, `Whitespace`, or `Eoi`.
+    ///
+    /// Note that the end of input is treated like a space here.
+    pub fn is_space(&self) -> bool {
+        matches!(
+            self,
+            SymbolKind::Newline | SymbolKind::Whitespace | SymbolKind::Eoi
         )
     }
 }
@@ -80,7 +113,7 @@ impl SymbolKind {
 pub struct Symbol<'a> {
     /// Original input the symbol is found in.
     pub input: &'a str,
-    pub(crate) offset: Offset,
+    pub offset: Offset,
     /// Kind of the symbol, e.g. a hash (#)
     pub kind: SymbolKind,
     /// Start position of the symbol in input.
@@ -141,7 +174,7 @@ impl Symbol<'_> {
     /// # Examples
     ///
     /// ```
-    /// use unimarkup_commons::scanner::{scan_str, Symbol};
+    /// use unimarkup_commons::lexer::{scan_str, symbol::Symbol};
     ///
     /// let input = "This is a string";
     /// let symbols: Vec<_> = scan_str(input);
@@ -212,27 +245,47 @@ impl From<&str> for SymbolKind {
             "{" => SymbolKind::OpenBrace,
             "}" => SymbolKind::CloseBrace,
             ":" => SymbolKind::Colon,
+            "." => SymbolKind::Dot,
             symbol
                 if symbol != "\n"
                     && symbol != "\r\n"
                     && symbol.starts_with(char::is_whitespace) =>
             {
                 SymbolKind::Whitespace
             }
-            _ => SymbolKind::Plain,
+            _ => {
+                let mut kind = SymbolKind::Plain;
+
+                if let Some(c) = value.chars().next() {
+                    if TERMINAL_PUNCTUATION.contains(c) {
+                        kind = SymbolKind::TerminalPunctuation;
+                    }
+                }
+
+                kind
+            }
         }
     }
 }
 
 impl SymbolKind {
     pub fn as_str(&self) -> &str {
         match self {
+            SymbolKind::Plain | SymbolKind::TerminalPunctuation => {
+                #[cfg(debug_assertions)]
+                panic!(
+                    "Tried to create &str from '{:?}', which has undefined &str representation.",
+                    self
+                );
+
+                #[cfg(not(debug_assertions))]
+                ""
+            }
             SymbolKind::Hash => "#",
-            SymbolKind::Plain => "",
             SymbolKind::Tick => "`",
             SymbolKind::Whitespace => " ",
             SymbolKind::Newline => "\n",
-            SymbolKind::EOI => "",
+            SymbolKind::Eoi => "",
             SymbolKind::Backslash => "\\",
             SymbolKind::Star => "*",
             SymbolKind::Minus => "-",
@@ -251,6 +304,7 @@ impl SymbolKind {
             SymbolKind::OpenBrace => "{",
             SymbolKind::CloseBrace => "}",
             SymbolKind::Colon => ":",
+            SymbolKind::Dot => ".",
         }
     }
 }