Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions src/chemistry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ use std::convert::TryInto;
use std::collections::HashSet;
use std::cmp::Ordering;
use crate::errors::*;
use std::sync::LazyLock;


pub static NOT_CHEMISTRY: isize = -10000; // should overwhelm any positive signal
Expand Down Expand Up @@ -939,10 +940,8 @@ fn likely_chem_superscript(sup: Element) -> isize {
// bullet is radical (en.wikipedia.org/wiki/Radical_(chemistry)#Depiction_in_chemical_reactions); mhchem uses dot operator
// these can stand alone, be followed by +/-, or have a number in front, e.g. "(2•)-" [examples from mhchem documentation]
// roman numerals are "oxidation state" and range from -4 to +9
lazy_static! {
static ref MULTIPLE_PLUS_OR_MINUS_OR_DOT: Regex = Regex::new(r"^\++$|^-+$|^\U{2212}+$|^[⋅∙•][-+\U{2212}]*$").unwrap();
static ref SINGLE_PLUS_OR_MINUS_OR_DOT: Regex = Regex::new(r"^[+-\U{2212}⋅∙•]$").unwrap();
}
static MULTIPLE_PLUS_OR_MINUS_OR_DOT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\++$|^-+$|^\U{2212}+$|^[⋅∙•][-+\U{2212}]*$").unwrap());
// Matches exactly one charge/radical symbol: '+', '-', U+2212 (minus sign), or one of the dot characters.
// The '-' is listed first in the class so that it is a literal (same ordering as in MULTIPLE_PLUS_OR_MINUS_OR_DOT);
// written as `[+-\U{2212}...]` it forms a range from '+' to U+2212 and matches almost any single character.
static SINGLE_PLUS_OR_MINUS_OR_DOT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[-+\U{2212}⋅∙•]$").unwrap());
static DOTS: &[char; 3] = &['⋅', '∙', '•'];
let sup_name = name(sup);
if sup_name == "mo" && MULTIPLE_PLUS_OR_MINUS_OR_DOT.is_match(as_text(sup)) {
Expand Down Expand Up @@ -1617,11 +1616,9 @@ fn is_equilibrium_constant(mut mathml: Element) -> bool {
return name(mathml) == "mi" && as_text(mathml) == "K";
}

lazy_static! {
// Oxidation states range from -4 to 9 and are written with (a subset of) roman numerals.
// All instances seem to be upper case that I've seen.
static ref SMALL_UPPER_ROMAN_NUMERAL: Regex = Regex::new(r"^\s*^(IX|IV|V?I{0,3})\s*$").unwrap();
}
// Oxidation states range from -4 to 9 and are written with (a subset of) roman numerals.
// All instances seem to be upper case that I've seen.
// Whitespace is allowed on both sides of the numeral. (A second '^' directly after the leading `\s*`
// would force the numeral to begin at the very start of the text, making that `\s*` useless.)
// Note: every part of `V?I{0,3}` is optional, so empty or whitespace-only text also matches.
static SMALL_UPPER_ROMAN_NUMERAL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*(IX|IV|V?I{0,3})\s*$").unwrap());

/// look for "(s), "(l)", "(g)", "(aq)" (could also use [...])
/// this might be called before canonicalization, but in clean_chemistry_mrow, we made sure "( xxx )" is grouped properly
Expand Down
36 changes: 19 additions & 17 deletions src/infer_intent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::speech::SpeechRulesWithContext;
use crate::canonicalize::{as_element, as_text, name, create_mathml_element, set_mathml_name, INTENT_ATTR, MATHML_FROM_NAME_ATTR};
use crate::errors::*;
use std::fmt;
use std::sync::LazyLock;
use crate::pretty_print::mml_to_string;
use crate::xpath_functions::is_leaf;
use regex::Regex;
Expand Down Expand Up @@ -243,23 +244,24 @@ pub fn intent_speech_for_name(intent_name: &str, verbosity: &str, fixity: &str)
// property := S ':' NCName
// S := [ \t\n\r]*

lazy_static! {
// The practical restrictions of NCName are that it cannot contain several symbol characters like
// !, ", #, $, %, &, ', (, ), *, +, ,, /, :, ;, <, =, >, ?, @, [, \, ], ^, `, {, |, }, ~, and whitespace characters
// Furthermore an NCName cannot begin with a number, dot or minus character although they can appear later in an NCName.
// NC_NAME defined in www.w3.org/TR/REC-xml/#sec-common-syn, but is complicated
// We follow NC_NAME for the basic latin block, but then allow everything
static ref CONCEPT_OR_LITERAL: Regex = Regex::new(
r#"^[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // NC_NAME but simpler
).unwrap();
static ref PROPERTY: Regex = Regex::new(
r#"^:[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // : NC_NAME
).unwrap();
static ref ARG_REF: Regex = Regex::new(
r#"^\$[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // $ NC_NAME
).unwrap();
static ref NUMBER: Regex = Regex::new(r#"^-?[0-9]+(\.[0-9]+)?"#).unwrap();
}
// The practical restrictions of NCName are that it cannot contain several symbol characters like
// !, ", #, $, %, &, ', (, ), *, +, ,, /, :, ;, <, =, >, ?, @, [, \, ], ^, `, {, |, }, ~, and whitespace characters
// Furthermore an NCName cannot begin with a number, dot or minus character although they can appear later in an NCName.
// NC_NAME defined in www.w3.org/TR/REC-xml/#sec-common-syn, but is complicated
// We follow NC_NAME for the basic latin block, but then allow everything
//
// All four patterns below start with '^', so they only match at the beginning of the text they are given.
// In the name patterns, the first character class excludes U+0000-U+0040 (which covers digits, '.', and '-'),
// while the class for the following characters only excludes U+0000-U+002C plus "/:;<=>?@",
// so '-', '.', and digits are accepted after the first character.

// A bare name (no prefix character).
static CONCEPT_OR_LITERAL: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"^[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // NC_NAME but simpler
).unwrap()
});
// A ':' immediately followed by a name.
static PROPERTY: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"^:[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // : NC_NAME
).unwrap()
});
// A '$' immediately followed by a name.
static ARG_REF: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"^\$[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // $ NC_NAME
).unwrap()
});
// An optional '-', one or more ASCII digits, and an optional fractional part ('.' followed by one or more digits).
static NUMBER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"^-?[0-9]+(\.[0-9]+)?"#).unwrap());

static TERMINALS_AS_U8: [u8; 3] = [b'(', b',', b')'];
// static TERMINALS: [char; 3] = ['(', ',',')'];
Expand Down
23 changes: 9 additions & 14 deletions src/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#![allow(non_snake_case)]
#![allow(clippy::needless_return)]
use std::cell::RefCell;
use std::sync::LazyLock;

use crate::canonicalize::{as_text, create_mathml_element};
use crate::errors::*;
Expand Down Expand Up @@ -94,14 +95,12 @@ pub fn get_version() -> String {
/// The ids can be used for sync highlighting if the `Bookmark` API preference is true.
pub fn set_mathml(mathml_str: String) -> Result<String> {
enable_logs();
lazy_static! {
// if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822)
static ref MATHJAX_V2: Regex = Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap();
static ref MATHJAX_V3: Regex = Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap();
static ref NAMESPACE_DECL: Regex = Regex::new(r#"xmlns:[[:alpha:]]+"#).unwrap(); // very limited namespace prefix match
static ref PREFIX: Regex = Regex::new(r#"(</?)[[:alpha:]]+:"#).unwrap(); // very limited namespace prefix match
static ref HTML_ENTITIES: Regex = Regex::new(r#"&([a-zA-Z]+?);"#).unwrap();
}
// if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822)
static MATHJAX_V2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap());
static MATHJAX_V3: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap());
static NAMESPACE_DECL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"xmlns:[[:alpha:]]+"#).unwrap()); // very limited namespace prefix match
static PREFIX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"(</?)[[:alpha:]]+:"#).unwrap()); // very limited namespace prefix match
static HTML_ENTITIES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"&([a-zA-Z]+?);"#).unwrap());

NAVIGATION_STATE.with(|nav_stack| {
nav_stack.borrow_mut().reset();
Expand Down Expand Up @@ -670,9 +669,7 @@ pub fn trim_element(e: Element, allow_structure_in_leaves: bool) {

// space, tab, newline, form feed, and carriage return all get collapsed to a single space
const WHITESPACE: &[char] = &[' ', '\u{0009}', '\u{000A}','\u{000C}', '\u{000D}'];
lazy_static! {
static ref WHITESPACE_MATCH: Regex = Regex::new(r#"[ \u{0009}\u{000A}\u{00C}\u{000D}]+"#).unwrap();
}
static WHITESPACE_MATCH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"[ \u{0009}\u{000A}\u{00C}\u{000D}]+"#).unwrap());

if is_leaf(e) && (!allow_structure_in_leaves || IsNode::is_mathml(e)) {
// Assume it is HTML inside of the leaf -- turn the HTML into a string
Expand Down Expand Up @@ -1108,9 +1105,7 @@ mod tests {
set_mathml("<math><mrow><mo>&#x02212;</mo><mi>&#x1D55E;</mi></mrow></math>".to_string()).unwrap();

// need to remove unique ids
lazy_static! {
static ref ID_MATCH: Regex = Regex::new(r#"id='.+?' "#).unwrap();
}
static ID_MATCH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"id='.+?' "#).unwrap());
let entity_str = ID_MATCH.replace_all(&entity_str, "");
let converted_str = ID_MATCH.replace_all(&converted_str, "");
assert_eq!(entity_str, converted_str, "normal entity test failed");
Expand Down
5 changes: 2 additions & 3 deletions src/prefs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ extern crate dirs;
use std::cell::RefCell;
use std::rc::Rc;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;
use crate::speech::{as_str_checked, RulesFor, FileAndTime};
use std::collections::{HashMap, HashSet};
use phf::phf_set;
Expand All @@ -35,9 +36,7 @@ use crate::errors::*;
/// Use to indicate preference not found with Preference::to_string()
pub static NO_PREFERENCE: &str = "\u{FFFF}";

lazy_static! {
static ref DEFAULT_LANG: Yaml = Yaml::String("en".to_string());
}
static DEFAULT_LANG: LazyLock<Yaml> = LazyLock::new(|| Yaml::String("en".to_string()));


// Preferences are recorded here
Expand Down
11 changes: 6 additions & 5 deletions src/speech.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use std::path::PathBuf;
use std::collections::HashMap;
use std::cell::{RefCell, RefMut};
use std::sync::LazyLock;
use sxd_document::dom::{ChildOfElement, Document, Element};
use sxd_document::{Package, QName};
use sxd_xpath::context::Evaluation;
Expand Down Expand Up @@ -565,13 +566,13 @@ impl InsertChildren {
}


lazy_static! {
static ref ATTR_NAME_VALUE: Regex = Regex::new(
static ATTR_NAME_VALUE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
// match name='value', where name is sort of an NCNAME (see CONCEPT_OR_LITERAL in infer_intent.rs)
// The quotes can be either single or double quotes
// The quotes can be either single or double quotes
r#"(?P<name>[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*)\s*=\s*('(?P<value>[^']+)'|"(?P<dqvalue>[^"]+)")"#
).unwrap();
}
).unwrap()
});

// structure used when "intent:" is encountered in a rule
// the name is either a string or an xpath that needs evaluation. 99% of the time it is a string
Expand Down
23 changes: 8 additions & 15 deletions src/tts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ use std::string::ToString;
use std::str::FromStr;
use strum_macros::{Display, EnumString};
use regex::Regex;
use std::sync::LazyLock;
use sxd_xpath::Value;

const MIN_PAUSE:f64 = 50.0; // ms -- avoids clutter of putting out pauses that probably can't be heard
Expand Down Expand Up @@ -595,9 +596,7 @@ impl TTS {
/// The computation is based on the length of the speech strings (after removing tagging).
/// There is a bias towards pausing more _after_ longer strings.
pub fn compute_auto_pause(&self, prefs: &PreferenceManager, before: &str, after: &str) -> String {
lazy_static! {
static ref REMOVE_XML: Regex = Regex::new(r"<.+?>").unwrap(); // punctuation ending with a '.'
}
static REMOVE_XML: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<.+?>").unwrap()); // a tag: '<', one or more chars (as few as possible), then '>'
let before_len;
let after_len;
match self {
Expand Down Expand Up @@ -653,10 +652,8 @@ impl TTS {

fn merge_pauses_none(&self, str: &str) -> String {
// punctuation used for pauses is ",", ";"
lazy_static! {
static ref SPACES: Regex = Regex::new(r"\s+([;,])").unwrap(); // two or more pauses
static ref MULTIPLE_PAUSES: Regex = Regex::new(r"([,;][,;]+)").unwrap(); // two or more pauses
}
static SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+([;,])").unwrap()); // whitespace directly before a pause char; the pause char is captured as $1
static MULTIPLE_PAUSES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"([,;][,;]+)").unwrap()); // two or more pauses
// we reduce all sequences of two or more pauses to a single medium pause
let merges_string = SPACES.replace_all(str, "$1").to_string();
let merges_string = MULTIPLE_PAUSES.replace_all(&merges_string, ";").to_string();
Expand All @@ -680,19 +677,15 @@ impl TTS {
}

/// Merge consecutive SAPI5 `<silence .../>` pauses found in `str`.
///
/// The work is delegated to `TTS::merge_pauses_xml`, which is given:
/// * a regex that finds runs of two or more `<silence msec...>` tags (optionally separated by spaces),
/// * a regex that captures the digits following `msec=` in a tag, and
/// * a closure that formats a `<silence .../>` tag for a given amount.
///
/// Returns the string produced by `TTS::merge_pauses_xml`.
fn merge_pauses_sapi5(&self, str: &str) -> String {
lazy_static! {
static ref CONSECUTIVE_BREAKS: Regex = Regex::new(r"(<silence msec[^>]+?> *){2,}").unwrap(); // two or more pauses
static ref PAUSE_AMOUNT: Regex = Regex::new(r"msec=.*?(\d+)").unwrap(); // amount after 'time'
}
static CONSECUTIVE_BREAKS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(<silence msec[^>]+?> *){2,}").unwrap()); // two or more pauses
static PAUSE_AMOUNT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"msec=.*?(\d+)").unwrap()); // amount after 'msec'
// A single '=' is needed for a well-formed attribute ("msec==..." is not valid XML);
// this mirrors the `<break time='...'/>` replacement used by merge_pauses_ssml.
let replacement = |amount: usize| format!("<silence msec='{amount}ms'/>");
return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement);
}

/// Merge consecutive SSML `<break .../>` pauses found in `str`.
///
/// The work is delegated to `TTS::merge_pauses_xml`, which is given a regex that finds runs of two or more
/// `<break time=...>` tags, a regex that captures the digits following `time=`, and a closure that
/// formats a `<break time='...ms'/>` tag for a given amount.
fn merge_pauses_ssml(&self, str: &str) -> String {
lazy_static! {
static ref CONSECUTIVE_BREAKS: Regex = Regex::new(r"(<break time=[^>]+?> *){2,}").unwrap(); // two or more pauses
static ref PAUSE_AMOUNT: Regex = Regex::new(r"time=.*?(\d+)").unwrap(); // amount after 'time'
}
static CONSECUTIVE_BREAKS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(<break time=[^>]+?> *){2,}").unwrap()); // two or more pauses
static PAUSE_AMOUNT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"time=.*?(\d+)").unwrap()); // amount after 'time'
let replacement = |amount: usize| format!("<break time='{amount}ms'/>");
return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement);
}
Expand Down
15 changes: 5 additions & 10 deletions src/xpath_functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use crate::definitions::{Definitions, SPEECH_DEFINITIONS, BRAILLE_DEFINITIONS};
use regex::Regex;
use crate::pretty_print::mml_to_string;
use std::cell::{Ref, RefCell};
use std::sync::LazyLock;
use std::thread::LocalKey;
use phf::phf_set;
use sxd_xpath::function::Error as XPathError;
Expand Down Expand Up @@ -265,9 +266,7 @@ impl IsNode {
// Returns true if 'frac' is a common fraction
// In this case, the numerator and denominator can be no larger than 'num_limit' and 'denom_limit'
fn is_common_fraction(frac: Element, num_limit: usize, denom_limit: usize) -> bool {
lazy_static! {
static ref ALL_DIGITS: Regex = Regex::new(r"\d+").unwrap(); // match one or more digits
}
static ALL_DIGITS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+").unwrap()); // match one or more digits

if !is_tag(frac, "mfrac") && !is_tag(frac, "fraction"){
return false;
Expand Down Expand Up @@ -449,9 +448,7 @@ impl ToOrdinal {
* Returns the string representation of that number or an error message
*/
fn convert(number: &str, fractional: bool, plural: bool) -> Option<String> {
lazy_static! {
static ref NO_DIGIT: Regex = Regex::new(r"[^\d]").unwrap(); // match anything except a digit
}
static NO_DIGIT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^\d]").unwrap()); // match anything except a digit
return SPEECH_DEFINITIONS.with(|definitions| {
let definitions = definitions.borrow();
let numbers_large = definitions.get_vec("NumbersLarge")?;
Expand Down Expand Up @@ -1349,10 +1346,8 @@ pub struct FontSizeGuess;
// returns original node match isn't found
impl FontSizeGuess {
pub fn em_from_value(value_with_unit: &str) -> f64 {
lazy_static! {
// match one or more digits followed by a unit -- there are many more units, but they tend to be large and rarer(?)
static ref FONT_VALUE: Regex = Regex::new(r"(-?[0-9]*\.?[0-9]*)(px|cm|mm|Q|in|ppc|pt|ex|em|rem)").unwrap();
}
// match one or more digits followed by a unit -- there are many more units, but they tend to be large and rarer(?)
static FONT_VALUE: LazyLock<Regex> = LazyLock::new(|| { Regex::new(r"(-?[0-9]*\.?[0-9]*)(px|cm|mm|Q|in|ppc|pt|ex|em|rem)").unwrap() });
let cap = FONT_VALUE.captures(value_with_unit);
if let Some(cap) = cap {
if cap.len() == 3 {
Expand Down
7 changes: 2 additions & 5 deletions tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
#[cfg(test)]

use regex::Regex;
extern crate lazy_static;
use lazy_static::lazy_static;
use std::sync::LazyLock;
pub use libmathcat::interface::*;


Expand Down Expand Up @@ -35,9 +34,7 @@ pub fn abs_rules_dir_path() -> String {
// Collapse every run of one or more spaces in 'str' into a single space,
// so comparisons don't depend on how many spaces separate things.
// Only the space character is matched (not tabs or newlines).
#[allow(dead_code)] // used in testing
fn strip_spaces(str: &str) -> String {
lazy_static! {
static ref SPACES: Regex = Regex::new(r" +").unwrap();
}
static SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" +").unwrap());
return String::from(SPACES.replace_all(str, " "));
}

Expand Down
Loading