From 5bfea6fdfd7257b84fb8e7b613e1df520ea1fbe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Gro=C3=9F?= Date: Thu, 8 Jan 2026 07:53:32 +0100 Subject: [PATCH 1/8] use LazyLock in prefs.rs --- src/prefs.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/prefs.rs b/src/prefs.rs index f73cf89f..d28466a5 100644 --- a/src/prefs.rs +++ b/src/prefs.rs @@ -26,6 +26,7 @@ extern crate dirs; use std::cell::RefCell; use std::rc::Rc; use std::path::{Path, PathBuf}; +use std::sync::LazyLock; use crate::speech::{as_str_checked, RulesFor, FileAndTime}; use std::collections::{HashMap, HashSet}; use phf::phf_set; @@ -35,9 +36,7 @@ use crate::errors::*; /// Use to indicate preference not found with Preference::to_string() pub static NO_PREFERENCE: &str = "\u{FFFF}"; -lazy_static! { - static ref DEFAULT_LANG: Yaml = Yaml::String("en".to_string()); -} +static DEFAULT_LANG: LazyLock = LazyLock::new(|| Yaml::String("en".to_string())); // Preferences are recorded here From 2f10adff4dbebad0176b5d795aff14165469ea31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Gro=C3=9F?= Date: Thu, 8 Jan 2026 08:51:41 +0100 Subject: [PATCH 2/8] change more parts --- src/chemistry.rs | 15 ++++++--------- src/infer_intent.rs | 36 +++++++++++++++++++----------------- src/interface.rs | 23 +++++++++-------------- src/speech.rs | 11 ++++++----- src/tts.rs | 23 ++++++++--------------- src/xpath_functions.rs | 15 +++++---------- tests/common/mod.rs | 7 ++----- 7 files changed, 55 insertions(+), 75 deletions(-) diff --git a/src/chemistry.rs b/src/chemistry.rs index 8d2f625d..1d94fe32 100644 --- a/src/chemistry.rs +++ b/src/chemistry.rs @@ -46,6 +46,7 @@ use std::convert::TryInto; use std::collections::HashSet; use std::cmp::Ordering; use crate::errors::*; +use std::sync::LazyLock; pub static NOT_CHEMISTRY: isize = -10000; // should overwhelm any positive signal @@ -939,10 +940,8 @@ fn likely_chem_superscript(sup: Element) -> isize { // bullet is radical 
(en.wikipedia.org/wiki/Radical_(chemistry)#Depiction_in_chemical_reactions); mhchem uses dot operator // these can stand alone, be followed by +/- or have a number in front "(2•)-"" [examples from mhchem documentation] // roman numerals are "oxidation state" and range from -4 to +9 - lazy_static! { - static ref MULTIPLE_PLUS_OR_MINUS_OR_DOT: Regex = Regex::new(r"^\++$|^-+$|^\U{2212}+$|^[⋅∙•][-+\U{2212}]*$").unwrap(); - static ref SINGLE_PLUS_OR_MINUS_OR_DOT: Regex = Regex::new(r"^[+-\U{2212}⋅∙•]$").unwrap(); - } + static MULTIPLE_PLUS_OR_MINUS_OR_DOT: LazyLock = LazyLock::new(|| Regex::new(r"^\++$|^-+$|^\U{2212}+$|^[⋅∙•][-+\U{2212}]*$").unwrap()); + static SINGLE_PLUS_OR_MINUS_OR_DOT: LazyLock = LazyLock::new(|| Regex::new(r"^[+-\U{2212}⋅∙•]$").unwrap()); static DOTS: &[char; 3] = &['⋅', '∙', '•']; let sup_name = name(sup); if sup_name == "mo" && MULTIPLE_PLUS_OR_MINUS_OR_DOT.is_match(as_text(sup)) { @@ -1617,11 +1616,9 @@ fn is_equilibrium_constant(mut mathml: Element) -> bool { return name(mathml) == "mi" && as_text(mathml) == "K"; } -lazy_static! { - // Oxidation states range from -4 to 9 and are written with (a subset of) roman numerals. - // All instances seem to be upper case that I've seen. - static ref SMALL_UPPER_ROMAN_NUMERAL: Regex = Regex::new(r"^\s*^(IX|IV|V?I{0,3})\s*$").unwrap(); -} +// Oxidation states range from -4 to 9 and are written with (a subset of) roman numerals. +// All instances seem to be upper case that I've seen. 
+static SMALL_UPPER_ROMAN_NUMERAL: LazyLock = LazyLock::new(|| Regex::new(r"^\s*^(IX|IV|V?I{0,3})\s*$").unwrap()); /// look for "(s), "(l)", "(g)", "(aq)" (could also use [...]) /// this might be called before canonicalization, but in clean_chemistry_mrow, we made sure "( xxx )" is grouped properly diff --git a/src/infer_intent.rs b/src/infer_intent.rs index 7da65768..4c92e7a0 100644 --- a/src/infer_intent.rs +++ b/src/infer_intent.rs @@ -11,6 +11,7 @@ use crate::speech::SpeechRulesWithContext; use crate::canonicalize::{as_element, as_text, name, create_mathml_element, set_mathml_name, INTENT_ATTR, MATHML_FROM_NAME_ATTR}; use crate::errors::*; use std::fmt; +use std::sync::LazyLock; use crate::pretty_print::mml_to_string; use crate::xpath_functions::is_leaf; use regex::Regex; @@ -243,23 +244,24 @@ pub fn intent_speech_for_name(intent_name: &str, verbosity: &str, fixity: &str) // property := S ':' NCName // S := [ \t\n\r]* -lazy_static! { - // The practical restrictions of NCName are that it cannot contain several symbol characters like - // !, ", #, $, %, &, ', (, ), *, +, ,, /, :, ;, <, =, >, ?, @, [, \, ], ^, `, {, |, }, ~, and whitespace characters - // Furthermore an NCName cannot begin with a number, dot or minus character although they can appear later in an NCName. 
- // NC_NAME defined in www.w3.org/TR/REC-xml/#sec-common-syn, but is complicated - // We follow NC_NAME for the basic latin block, but then allow everything - static ref CONCEPT_OR_LITERAL: Regex = Regex::new( - r#"^[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // NC_NAME but simpler - ).unwrap(); - static ref PROPERTY: Regex = Regex::new( - r#"^:[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // : NC_NAME - ).unwrap(); - static ref ARG_REF: Regex = Regex::new( - r#"^\$[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // $ NC_NAME - ).unwrap(); - static ref NUMBER: Regex = Regex::new(r#"^-?[0-9]+(\.[0-9]+)?"#).unwrap(); -} +// The practical restrictions of NCName are that it cannot contain several symbol characters like +// !, ", #, $, %, &, ', (, ), *, +, ,, /, :, ;, <, =, >, ?, @, [, \, ], ^, `, {, |, }, ~, and whitespace characters +// Furthermore an NCName cannot begin with a number, dot or minus character although they can appear later in an NCName. 
+// NC_NAME defined in www.w3.org/TR/REC-xml/#sec-common-syn, but is complicated +// We follow NC_NAME for the basic latin block, but then allow everything +static CONCEPT_OR_LITERAL: LazyLock = LazyLock::new(|| { + Regex::new(r#"^[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // NC_NAME but simpler + ).unwrap() +}); +static PROPERTY: LazyLock = LazyLock::new(|| { + Regex::new(r#"^:[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // : NC_NAME + ).unwrap() +}); +static ARG_REF: LazyLock = LazyLock::new(|| { + Regex::new(r#"^\$[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // $ NC_NAME + ).unwrap() +}); +static NUMBER: LazyLock = LazyLock::new(|| Regex::new(r#"^-?[0-9]+(\.[0-9]+)?"#).unwrap()); static TERMINALS_AS_U8: [u8; 3] = [b'(', b',', b')']; // static TERMINALS: [char; 3] = ['(', ',',')']; diff --git a/src/interface.rs b/src/interface.rs index 665e1e0d..6b858e7f 100644 --- a/src/interface.rs +++ b/src/interface.rs @@ -3,6 +3,7 @@ #![allow(non_snake_case)] #![allow(clippy::needless_return)] use std::cell::RefCell; +use std::sync::LazyLock; use crate::canonicalize::{as_text, create_mathml_element}; use crate::errors::*; @@ -94,14 +95,12 @@ pub fn get_version() -> String { /// The ids can be used for sync highlighting if the `Bookmark` API preference is true. pub fn set_mathml(mathml_str: String) -> Result { enable_logs(); - lazy_static! 
{ - // if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822) - static ref MATHJAX_V2: Regex = Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap(); - static ref MATHJAX_V3: Regex = Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap(); - static ref NAMESPACE_DECL: Regex = Regex::new(r#"xmlns:[[:alpha:]]+"#).unwrap(); // very limited namespace prefix match - static ref PREFIX: Regex = Regex::new(r#"( = LazyLock::new(|| Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap()); + static MATHJAX_V3: LazyLock = LazyLock::new(|| Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap()); + static NAMESPACE_DECL: LazyLock = LazyLock::new(|| Regex::new(r#"xmlns:[[:alpha:]]+"#).unwrap()); // very limited namespace prefix match + static PREFIX: LazyLock = LazyLock::new(|| Regex::new(r#"( = LazyLock::new(|| Regex::new(r#"&([a-zA-Z]+?);"#).unwrap()); NAVIGATION_STATE.with(|nav_stack| { nav_stack.borrow_mut().reset(); @@ -670,9 +669,7 @@ pub fn trim_element(e: Element, allow_structure_in_leaves: bool) { // space, tab, newline, carriage return all get collapsed to a single space const WHITESPACE: &[char] = &[' ', '\u{0009}', '\u{000A}','\u{000C}', '\u{000D}']; - lazy_static! { - static ref WHITESPACE_MATCH: Regex = Regex::new(r#"[ \u{0009}\u{000A}\u{00C}\u{000D}]+"#).unwrap(); - } + static WHITESPACE_MATCH: LazyLock = LazyLock::new(|| Regex::new(r#"[ \u{0009}\u{000A}\u{00C}\u{000D}]+"#).unwrap()); if is_leaf(e) && (!allow_structure_in_leaves || IsNode::is_mathml(e)) { // Assume it is HTML inside of the leaf -- turn the HTML into a string @@ -1108,9 +1105,7 @@ mod tests { set_mathml("𝕞".to_string()).unwrap(); // need to remove unique ids - lazy_static! { - static ref ID_MATCH: Regex = Regex::new(r#"id='.+?' "#).unwrap(); - } + static ID_MATCH: LazyLock = LazyLock::new(|| Regex::new(r#"id='.+?' 
"#).unwrap()); let entity_str = ID_MATCH.replace_all(&entity_str, ""); let converted_str = ID_MATCH.replace_all(&converted_str, ""); assert_eq!(entity_str, converted_str, "normal entity test failed"); diff --git a/src/speech.rs b/src/speech.rs index 091528db..d36c3857 100644 --- a/src/speech.rs +++ b/src/speech.rs @@ -6,6 +6,7 @@ use std::path::PathBuf; use std::collections::HashMap; use std::cell::{RefCell, RefMut}; +use std::sync::LazyLock; use sxd_document::dom::{ChildOfElement, Document, Element}; use sxd_document::{Package, QName}; use sxd_xpath::context::Evaluation; @@ -565,13 +566,13 @@ impl InsertChildren { } -lazy_static! { - static ref ATTR_NAME_VALUE: Regex = Regex::new( +static ATTR_NAME_VALUE: LazyLock = LazyLock::new(|| { + Regex::new( // match name='value', where name is sort of an NCNAME (see CONCEPT_OR_LITERAL in infer_intent.rs) - // The quotes can be either single or double quotes + // The quotes can be either single or double quotes r#"(?P[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*)\s*=\s*('(?P[^']+)'|"(?P[^"]+)")"# - ).unwrap(); -} + ).unwrap() +}); // structure used when "intent:" is encountered in a rule // the name is either a string or an xpath that needs evaluation. 99% of the time it is a string diff --git a/src/tts.rs b/src/tts.rs index 11d52fe5..fcdfc468 100644 --- a/src/tts.rs +++ b/src/tts.rs @@ -77,6 +77,7 @@ use std::string::ToString; use std::str::FromStr; use strum_macros::{Display, EnumString}; use regex::Regex; +use std::sync::LazyLock; use sxd_xpath::Value; const MIN_PAUSE:f64 = 50.0; // ms -- avoids clutter of putting out pauses that probably can't be heard @@ -595,9 +596,7 @@ impl TTS { /// The computation is based on the length of the speech strings (after removing tagging). /// There is a bias towards pausing more _after_ longer strings. pub fn compute_auto_pause(&self, prefs: &PreferenceManager, before: &str, after: &str) -> String { - lazy_static! 
{ - static ref REMOVE_XML: Regex = Regex::new(r"<.+?>").unwrap(); // punctuation ending with a '.' - } + static REMOVE_XML: LazyLock = LazyLock::new(|| Regex::new(r"<.+?>").unwrap()); // punctuation ending with a '.' let before_len; let after_len; match self { @@ -653,10 +652,8 @@ impl TTS { fn merge_pauses_none(&self, str: &str) -> String { // punctuation used for pauses is ",", ";" - lazy_static! { - static ref SPACES: Regex = Regex::new(r"\s+([;,])").unwrap(); // two or more pauses - static ref MULTIPLE_PAUSES: Regex = Regex::new(r"([,;][,;]+)").unwrap(); // two or more pauses - } + static SPACES: LazyLock = LazyLock::new(|| Regex::new(r"\s+([;,])").unwrap()); // two or more pauses + static MULTIPLE_PAUSES: LazyLock = LazyLock::new(|| Regex::new(r"([,;][,;]+)").unwrap()); // two or more pauses // we reduce all sequences of two or more pauses to a single medium pause let merges_string = SPACES.replace_all(str, "$1").to_string(); let merges_string = MULTIPLE_PAUSES.replace_all(&merges_string, ";").to_string(); @@ -680,19 +677,15 @@ impl TTS { } fn merge_pauses_sapi5(&self, str: &str) -> String { - lazy_static! { - static ref CONSECUTIVE_BREAKS: Regex = Regex::new(r"(]+?> *){2,}").unwrap(); // two or more pauses - static ref PAUSE_AMOUNT: Regex = Regex::new(r"msec=.*?(\d+)").unwrap(); // amount after 'time' - } + static CONSECUTIVE_BREAKS: LazyLock = LazyLock::new(|| Regex::new(r"(]+?> *){2,}").unwrap()); // two or more pauses + static PAUSE_AMOUNT: LazyLock = LazyLock::new(|| Regex::new(r"msec=.*?(\d+)").unwrap()); // amount after 'time' let replacement = |amount: usize| format!(""); return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement); } fn merge_pauses_ssml(&self, str: &str) -> String { - lazy_static! 
{ - static ref CONSECUTIVE_BREAKS: Regex = Regex::new(r"(]+?> *){2,}").unwrap(); // two or more pauses - static ref PAUSE_AMOUNT: Regex = Regex::new(r"time=.*?(\d+)").unwrap(); // amount after 'time' - } + static CONSECUTIVE_BREAKS: LazyLock = LazyLock::new(|| Regex::new(r"(]+?> *){2,}").unwrap()); // two or more pauses + static PAUSE_AMOUNT: LazyLock = LazyLock::new(|| Regex::new(r"time=.*?(\d+)").unwrap()); // amount after 'time' let replacement = |amount: usize| format!(""); return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement); } diff --git a/src/xpath_functions.rs b/src/xpath_functions.rs index ae68edf3..49543688 100644 --- a/src/xpath_functions.rs +++ b/src/xpath_functions.rs @@ -23,6 +23,7 @@ use crate::definitions::{Definitions, SPEECH_DEFINITIONS, BRAILLE_DEFINITIONS}; use regex::Regex; use crate::pretty_print::mml_to_string; use std::cell::{Ref, RefCell}; +use std::sync::LazyLock; use std::thread::LocalKey; use phf::phf_set; use sxd_xpath::function::Error as XPathError; @@ -265,9 +266,7 @@ impl IsNode { // Returns true if 'frac' is a common fraction // In this case, the numerator and denominator can be no larger than 'num_limit' and 'denom_limit' fn is_common_fraction(frac: Element, num_limit: usize, denom_limit: usize) -> bool { - lazy_static! { - static ref ALL_DIGITS: Regex = Regex::new(r"\d+").unwrap(); // match one or more digits - } + static ALL_DIGITS: LazyLock = LazyLock::new(|| Regex::new(r"\d+").unwrap()); // match one or more digits if !is_tag(frac, "mfrac") && !is_tag(frac, "fraction"){ return false; @@ -449,9 +448,7 @@ impl ToOrdinal { * Returns the string representation of that number or an error message */ fn convert(number: &str, fractional: bool, plural: bool) -> Option { - lazy_static! 
{ - static ref NO_DIGIT: Regex = Regex::new(r"[^\d]").unwrap(); // match anything except a digit - } + static NO_DIGIT: LazyLock = LazyLock::new(|| Regex::new(r"[^\d]").unwrap()); // match anything except a digit return SPEECH_DEFINITIONS.with(|definitions| { let definitions = definitions.borrow(); let numbers_large = definitions.get_vec("NumbersLarge")?; @@ -1349,10 +1346,8 @@ pub struct FontSizeGuess; // returns original node match isn't found impl FontSizeGuess { pub fn em_from_value(value_with_unit: &str) -> f64 { - lazy_static! { - // match one or more digits followed by a unit -- there are many more units, but they tend to be large and rarer(?) - static ref FONT_VALUE: Regex = Regex::new(r"(-?[0-9]*\.?[0-9]*)(px|cm|mm|Q|in|ppc|pt|ex|em|rem)").unwrap(); - } + // match one or more digits followed by a unit -- there are many more units, but they tend to be large and rarer(?) + static FONT_VALUE: LazyLock = LazyLock::new(|| { Regex::new(r"(-?[0-9]*\.?[0-9]*)(px|cm|mm|Q|in|ppc|pt|ex|em|rem)").unwrap() }); let cap = FONT_VALUE.captures(value_with_unit); if let Some(cap) = cap { if cap.len() == 3 { diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 736e2b84..9faf074e 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -2,8 +2,7 @@ #[cfg(test)] use regex::Regex; -extern crate lazy_static; -use lazy_static::lazy_static; +use std::sync::LazyLock; pub use libmathcat::interface::*; @@ -35,9 +34,7 @@ pub fn abs_rules_dir_path() -> String { // Strip spaces from 'str' so comparison doesn't need to worry about spacing #[allow(dead_code)] // used in testing fn strip_spaces(str: &str) -> String { - lazy_static! 
{ - static ref SPACES: Regex = Regex::new(r" +").unwrap(); - } + static SPACES: LazyLock = LazyLock::new(|| Regex::new(r" +").unwrap()); return String::from(SPACES.replace_all(str, " ")); } From 138e0f28942de68ce1e816c628beb1cd2bcfee79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Gro=C3=9F?= Date: Fri, 9 Jan 2026 18:55:09 +0100 Subject: [PATCH 3/8] implement and test line-reporting per diff --- PythonScripts/audit_translations/auditor.py | 124 +++++++++++++----- .../audit_translations/dataclasses.py | 39 +++++- PythonScripts/audit_translations/parsers.py | 108 ++++++++++++++- .../Languages/en/SharedRules/calculus.yaml | 57 ++++++++ .../fixtures/Rules/Languages/en/overview.yaml | 110 ++++++++++++++++ .../Languages/es/SharedRules/calculus.yaml | 39 ++++++ .../fixtures/Rules/Languages/es/overview.yaml | 110 ++++++++++++++++ .../tests/golden/jsonl/de.json | 78 +++++++---- .../audit_translations/tests/test_auditor.py | 17 ++- .../tests/test_cli_end_to_end.py | 84 ++++++++++++ .../audit_translations/tests/test_parsers.py | 40 ++++++ 11 files changed, 737 insertions(+), 69 deletions(-) create mode 100644 PythonScripts/audit_translations/tests/fixtures/Rules/Languages/en/SharedRules/calculus.yaml create mode 100644 PythonScripts/audit_translations/tests/fixtures/Rules/Languages/en/overview.yaml create mode 100644 PythonScripts/audit_translations/tests/fixtures/Rules/Languages/es/SharedRules/calculus.yaml create mode 100644 PythonScripts/audit_translations/tests/fixtures/Rules/Languages/es/overview.yaml create mode 100644 PythonScripts/audit_translations/tests/test_cli_end_to_end.py diff --git a/PythonScripts/audit_translations/auditor.py b/PythonScripts/audit_translations/auditor.py index 5495dea2..9375ca35 100644 --- a/PythonScripts/audit_translations/auditor.py +++ b/PythonScripts/audit_translations/auditor.py @@ -17,7 +17,7 @@ from rich.table import Table from .dataclasses import RuleInfo, RuleDifference, ComparisonResult -from .parsers import parse_yaml_file, 
diff_rules +from .parsers import parse_yaml_file, diff_rules, extract_structure_elements console = Console() @@ -127,7 +127,7 @@ def merge_rules(base_rules: List[RuleInfo], region_rules: List[RuleInfo]) -> Lis if include_untranslated: for rule in translated_rules: if rule.has_untranslated_text and not rule.audit_ignore: - untranslated_text.append((rule, rule.untranslated_keys)) + untranslated_text.append((rule, rule.untranslated_entries)) # Find fine-grained differences in rules that exist in both files (skip if audit-ignore) rule_differences = [] @@ -157,16 +157,16 @@ def rule_label(rule: RuleInfo) -> str: return f"[cyan]{escape(rule.name)}[/] [dim][{escape(tag)}][/]" -def print_rule_item(rule: RuleInfo, context: str = ""): - console.print(f" [dim]•[/] {rule_label(rule)} [dim](line {rule.line_number}{context})[/]") +def print_rule_item(rule: RuleInfo, issue_line: int, context: str = ""): + console.print(f" [dim]•[/] {rule_label(rule)} [dim](line {issue_line}{context})[/]") -def print_diff_item(diff: RuleDifference): +def print_diff_item(diff: RuleDifference, line_en: int, line_tr: int): """Print a single rule difference""" rule = diff.english_rule console.print( f" [dim]•[/] {rule_label(rule)} " - f"[dim](line {rule.line_number} en, {diff.translated_rule.line_number} tr)[/]" + f"[dim](line {line_en} en, {line_tr} tr)[/]" ) console.print(f" [dim]{diff.description}:[/]") console.print(f" [green]en:[/] {escape(diff.english_snippet)}") @@ -180,11 +180,43 @@ def issue_base(rule: RuleInfo, file_name: str, language: str) -> dict: "rule_name": rule.name or "", "rule_tag": rule.tag or "", "rule_key": rule.key, - "line_en": None, - "line_tr": None, + "issue_line_en": None, + "issue_line_tr": None, + "rule_line_en": None, + "rule_line_tr": None, } +def first_structure_mismatch( + english_tokens: List[str], + translated_tokens: List[str], +) -> Tuple[Optional[str], Optional[str]]: + min_len = min(len(english_tokens), len(translated_tokens)) + for idx in range(min_len): + if 
english_tokens[idx] != translated_tokens[idx]: + return english_tokens[idx], translated_tokens[idx] + if len(english_tokens) > min_len: + return english_tokens[min_len], None + if len(translated_tokens) > min_len: + return None, translated_tokens[min_len] + return None, None + + +def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -> int: + if kind == "match": + lines = rule.line_map.get("match", []) + elif kind == "condition": + lines = rule.line_map.get("condition", []) + elif kind == "variables": + lines = rule.line_map.get("variables", []) + elif kind == "structure" and token: + token_key = f"structure:{token.rstrip(':')}" + lines = rule.line_map.get(token_key, []) + else: + lines = [] + return lines[0] if lines else rule.line_number + + def collect_issues( result: ComparisonResult, file_name: str, @@ -197,7 +229,8 @@ def collect_issues( issue.update( issue_type="missing_rule", diff_type="", - line_en=rule.line_number, + issue_line_en=rule.line_number, + rule_line_en=rule.line_number, description="Rule present in English but missing in translation", english_snippet="", translated_snippet="", @@ -210,7 +243,8 @@ def collect_issues( issue.update( issue_type="extra_rule", diff_type="", - line_tr=rule.line_number, + issue_line_tr=rule.line_number, + rule_line_tr=rule.line_number, description="Rule present in translation but missing in English", english_snippet="", translated_snippet="", @@ -218,27 +252,40 @@ def collect_issues( ) issues.append(issue) - for rule, texts in result.untranslated_text: - issue = issue_base(rule, file_name, language) - issue.update( - issue_type="untranslated_text", - diff_type="", - line_tr=rule.line_number, - description="Lowercase t/ot/ct keys indicate untranslated text", - english_snippet="", - translated_snippet="", - untranslated_texts=texts, - ) - issues.append(issue) + for rule, entries in result.untranslated_text: + for key, text, line in entries: + issue = issue_base(rule, file_name, language) + 
issue.update( + issue_type="untranslated_text", + diff_type="", + issue_line_tr=line or rule.line_number, + rule_line_tr=rule.line_number, + description="Lowercase t/ot/ct keys indicate untranslated text", + english_snippet="", + translated_snippet="", + untranslated_texts=[text], + ) + issues.append(issue) for diff in result.rule_differences: rule = diff.english_rule issue = issue_base(rule, file_name, language) + if diff.diff_type == "structure": + en_tokens = extract_structure_elements(diff.english_rule.data) + tr_tokens = extract_structure_elements(diff.translated_rule.data) + en_token, tr_token = first_structure_mismatch(en_tokens, tr_tokens) + issue_line_en = resolve_issue_line(diff.english_rule, "structure", en_token) + issue_line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token) + else: + issue_line_en = resolve_issue_line(diff.english_rule, diff.diff_type) + issue_line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type) issue.update( issue_type="rule_difference", diff_type=diff.diff_type, - line_en=diff.english_rule.line_number, - line_tr=diff.translated_rule.line_number, + issue_line_en=issue_line_en, + issue_line_tr=issue_line_tr, + rule_line_en=diff.english_rule.line_number, + rule_line_tr=diff.translated_rule.line_number, description=diff.description, english_snippet=diff.english_snippet, translated_snippet=diff.translated_snippet, @@ -278,16 +325,18 @@ def print_warnings(result: ComparisonResult, file_name: str) -> int: if result.missing_rules: console.print(f"\n [red]✗[/] [bold]Missing Rules[/] [[red]{len(result.missing_rules)}[/]] [dim](in English but not in translation)[/]") for rule in result.missing_rules: - print_rule_item(rule, context=" in English") + print_rule_item(rule, issue_line=rule.line_number, context=" in English") issues += 1 if result.untranslated_text: - console.print(f"\n [yellow]⚠[/] [bold]Untranslated Text[/] [[yellow]{len(result.untranslated_text)}[/]] [dim](lowercase t/ot/ct keys)[/]") - for rule, 
texts in result.untranslated_text: - print_rule_item(rule) - for text in texts: + untranslated_count = sum(len(entries) for _, entries in result.untranslated_text) + console.print(f"\n [yellow]⚠[/] [bold]Untranslated Text[/] [[yellow]{untranslated_count}[/]] [dim](lowercase t/ot/ct keys)[/]") + for rule, entries in result.untranslated_text: + for _, text, line in entries: + issue_line = line or rule.line_number + print_rule_item(rule, issue_line=issue_line) console.print(f" [dim]→[/] [yellow]\"{escape(text)}\"[/]") - issues += 1 + issues += 1 if result.rule_differences: total_diffs = len(result.rule_differences) @@ -296,13 +345,22 @@ def print_warnings(result: ComparisonResult, file_name: str) -> int: f"[[magenta]{total_diffs}[/]] [dim](structural differences between en and translation)[/]" ) for diff in result.rule_differences: - print_diff_item(diff) + if diff.diff_type == "structure": + en_tokens = extract_structure_elements(diff.english_rule.data) + tr_tokens = extract_structure_elements(diff.translated_rule.data) + en_token, tr_token = first_structure_mismatch(en_tokens, tr_tokens) + line_en = resolve_issue_line(diff.english_rule, "structure", en_token) + line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token) + else: + line_en = resolve_issue_line(diff.english_rule, diff.diff_type) + line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type) + print_diff_item(diff, line_en=line_en, line_tr=line_tr) issues += 1 if result.extra_rules: console.print(f"\n [blue]ℹ[/] [bold]Extra Rules[/] [[blue]{len(result.extra_rules)}[/]] [dim](may be intentional)[/]") for rule in result.extra_rules: - print_rule_item(rule) + print_rule_item(rule, issue_line=rule.line_number) issues += 1 return issues @@ -399,7 +457,7 @@ def audit_language( files_ok += 1 total_missing += len(result.missing_rules) - total_untranslated += len(result.untranslated_text) + total_untranslated += sum(len(entries) for _, entries in result.untranslated_text) total_extra += 
len(result.extra_rules) total_differences += len(result.rule_differences) diff --git a/PythonScripts/audit_translations/dataclasses.py b/PythonScripts/audit_translations/dataclasses.py index 36e45ab2..9cd4b0e9 100644 --- a/PythonScripts/audit_translations/dataclasses.py +++ b/PythonScripts/audit_translations/dataclasses.py @@ -5,12 +5,43 @@ """ from dataclasses import dataclass, field -from typing import Any, List, Tuple, Optional +from typing import Any, List, Tuple, Optional, Dict @dataclass class RuleInfo: - """Information about a single rule""" + """ + Information about a single rule parsed from a YAML file. + + Attributes + ---------- + name : Optional[str] + Rule name for standard rule files; None for unicode entries. + tag : Optional[str] + Rule tag (normalized string); None for unicode entries. + key : str + Stable identifier used for matching; for unicode entries this is the character or range key. + line_number : int + 1-based line number where the rule starts in the source file. + raw_content : str + Raw YAML block for this rule (used for reporting/snippets). + data : Optional[Any] + Parsed YAML node for the rule; used for structural diffs. + has_untranslated_text : bool + True if the rule contains lowercase t/ot/ct/etc. values. + untranslated_keys : List[str] + List of untranslated text values (used for summary counts). + untranslated_entries : List[Tuple[str, str, Optional[int]]] + List of (key, text, line) entries extracted from lowercase translation keys. + This drives per-issue JSONL output so each untranslated string can report + the specific YAML line number where it appears. + line_map : Dict[str, List[int]] + Mapping of element type to line numbers for rule components like match, + conditions, variables, and structural tokens. This is used to point + structural diffs at a precise line rather than the top of the rule. + audit_ignore : bool + True if the raw content contains an audit-ignore marker. 
+ """ name: Optional[str] # None for unicode entries tag: Optional[str] # None for unicode entries key: str # For unicode entries, this is the character/range @@ -19,6 +50,8 @@ class RuleInfo: data: Optional[Any] = None has_untranslated_text: bool = False untranslated_keys: List[str] = field(default_factory=list) + untranslated_entries: List[Tuple[str, str, Optional[int]]] = field(default_factory=list) # (key, text, line) for JSONL output + line_map: Dict[str, List[int]] = field(default_factory=dict) # Element-type -> line numbers for precise diff locations audit_ignore: bool = False @@ -38,7 +71,7 @@ class ComparisonResult: """Results from comparing English and translated files""" missing_rules: List[RuleInfo] # Rules in English but not in translation extra_rules: List[RuleInfo] # Rules in translation but not in English - untranslated_text: List[Tuple[RuleInfo, List[str]]] # Rules with lowercase t/ot/ct + untranslated_text: List[Tuple[RuleInfo, List[Tuple[str, str, Optional[int]]]]] # Rules with lowercase t/ot/ct file_path: str english_rule_count: int translated_rule_count: int diff --git a/PythonScripts/audit_translations/parsers.py b/PythonScripts/audit_translations/parsers.py index dd10b9e4..1553a2b0 100644 --- a/PythonScripts/audit_translations/parsers.py +++ b/PythonScripts/audit_translations/parsers.py @@ -5,7 +5,7 @@ """ import os -from typing import Any, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from ruamel.yaml import YAML from ruamel.yaml.scanner import ScannerError @@ -92,7 +92,8 @@ def parse_rules_file(content: str, data: Any) -> List[RuleInfo]: for item, raw_content, line_idx in zip(rule_items, raw_blocks, start_lines): rule_name = str(item.get("name")) tag = format_tag(item.get("tag")) - untranslated = find_untranslated_text_values(item) + untranslated_entries = find_untranslated_text_entries(item) + untranslated = [entry[1] for entry in untranslated_entries] rule_key = f"{rule_name}|{tag or 'unknown'}" 
rules.append(RuleInfo( name=rule_name, @@ -103,6 +104,8 @@ def parse_rules_file(content: str, data: Any) -> List[RuleInfo]: data=item, has_untranslated_text=len(untranslated) > 0, untranslated_keys=untranslated, + untranslated_entries=untranslated_entries, + line_map=build_line_map(item), audit_ignore=has_audit_ignore(raw_content) )) @@ -130,7 +133,8 @@ def parse_unicode_file(content: str, data: Any) -> List[RuleInfo]: raw_blocks = build_raw_blocks(lines, start_lines) for (char_key, value), raw_content, line_idx in zip(entries, raw_blocks, start_lines): - untranslated = find_untranslated_text_values(value) + untranslated_entries = find_untranslated_text_entries(value) + untranslated = [entry[1] for entry in untranslated_entries] rules.append(RuleInfo( name=None, tag=None, @@ -140,6 +144,8 @@ def parse_unicode_file(content: str, data: Any) -> List[RuleInfo]: data=value, has_untranslated_text=len(untranslated) > 0, untranslated_keys=untranslated, + untranslated_entries=untranslated_entries, + line_map=build_line_map(value), audit_ignore=has_audit_ignore(raw_content) )) @@ -183,6 +189,102 @@ def walk(value: Any) -> None: return untranslated +def find_untranslated_text_entries(node: Any) -> List[Tuple[str, str, Optional[int]]]: + """ + Find lowercase text keys (t, ot, ct, spell, pronounce, ifthenelse) and their line numbers. + Returns list of (key, text, line_number) entries. Line number is 1-based when available. 
+ """ + entries: List[Tuple[str, str, Optional[int]]] = [] + translation_keys = {"t", "ot", "ct", "spell", "pronounce", "ifthenelse"} + + def should_add(text: str) -> bool: + if not text.strip(): + return False + if len(text) == 1 and not text.isalpha(): + return False + if text.startswith('$') or text.startswith('@'): + return False + return True + + def key_line(mapping: Any, key: str) -> Optional[int]: + if hasattr(mapping, "lc") and hasattr(mapping.lc, "data"): + line_info = mapping.lc.data.get(key) + if line_info: + return line_info[0] + 1 + return None + + def walk(value: Any) -> None: + if isinstance(value, dict): + for key, child in value.items(): + if ( + isinstance(key, str) + and key.lower() in translation_keys + and not key.isupper() + and isinstance(child, str) + ): + if should_add(child): + entries.append((key, child, key_line(value, key))) + walk(child) + elif isinstance(value, list): + for item in value: + walk(item) + + walk(node) + return entries + + +def build_line_map(node: Any) -> Dict[str, List[int]]: + """ + Build a mapping of rule element types to line numbers. + Line numbers are 1-based. Missing elements are omitted. 
+ """ + line_map: Dict[str, List[int]] = {} + structure_tokens = { + "test", + "if", + "else_if", + "then", + "else", + "then_test", + "else_test", + "with", + "replace", + "intent", + } + + def add_line(kind: str, line: Optional[int]) -> None: + if line is None: + return + line_map.setdefault(kind, []).append(line) + + def key_line(mapping: Any, key: str) -> Optional[int]: + if hasattr(mapping, "lc") and hasattr(mapping.lc, "data"): + line_info = mapping.lc.data.get(key) + if line_info: + return line_info[0] + 1 + return None + + def walk(value: Any) -> None: + if isinstance(value, dict): + for key, child in value.items(): + if isinstance(key, str): + if key == "match": + add_line("match", key_line(value, key)) + if key in ("if", "else_if"): + add_line("condition", key_line(value, key)) + if key == "variables": + add_line("variables", key_line(value, key)) + if key in structure_tokens: + add_line(f"structure:{key}", key_line(value, key)) + walk(child) + elif isinstance(value, list): + for item in value: + walk(item) + + walk(node) + return line_map + + def normalize_match(value: Any) -> str: if isinstance(value, list): return " ".join(str(item) for item in value) diff --git a/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/en/SharedRules/calculus.yaml b/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/en/SharedRules/calculus.yaml new file mode 100644 index 00000000..df25dd85 --- /dev/null +++ b/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/en/SharedRules/calculus.yaml @@ -0,0 +1,57 @@ +--- +# fixture copied over from Rules + +- name: laplacian + tag: laplacian + match: "count(*) <= 1" # can be on ∇^2 or on enclosing mrow + replace: + - t: "LahPlahsian" # phrase('laplacian' of x) -- "LahPlahsian" sounds better with speech engines tested + - test: + if: "count(*) = 1" + then: + - test: + if: "$Verbosity!='Terse'" + then: [t: "of"] # phrase(function 'of' one variable) -- note OneCore voices spell out "div" + - test: + 
if: "not(IsNode(*[1], 'leaf'))" + then: [pause: short] + - x: "*[1]" + +- name: divergence + tag: divergence + match: "count(*) = 1" + replace: + - test: + if: "$Verbosity='Terse'" + then: [t: "dihv"] # phrase('div' is short for divergence) -- note OneCore voices spell out "div" + else: [t: "divergence of"] # phrase('divergence of' this function from the mean) + - test: + if: "not(IsNode(*[1], 'leaf'))" + then: [pause: short] + - x: "*[1]" + +- name: curl + tag: curl + match: "count(*) = 1" + replace: + - t: "curl" # phrase(the 'curl of' a field) + - test: + if: "$Verbosity!='Terse'" + then: [t: "of"] # phrase(function 'of' one variable) -- note OneCore voices spell out "div" + - test: + if: "not(IsNode(*[1], 'leaf'))" + then: [pause: short] + - x: "*[1]" + +- name: gradient + tag: gradient + match: "count(*) = 1" + replace: + - test: + if: "$Verbosity!='Terse'" + then: [t: "gradient of"] # phrase(the hill has a 'gradient of' five percent) + else: [t: "del"] # phrase(the delete key is labeled 'del') + - test: + if: "not(IsNode(*[1], 'leaf'))" + then: [pause: short] + - x: "*[1]" diff --git a/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/en/overview.yaml b/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/en/overview.yaml new file mode 100644 index 00000000..9286ba8b --- /dev/null +++ b/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/en/overview.yaml @@ -0,0 +1,110 @@ +--- +# fixture copied over from Rules + + + +- name: overview-default + tag: [mfrac, fraction] + match: "." + replace: + - test: + if: "IsNode(*[1], 'simple') and IsNode(*[2], 'simple')" + then: + - x: "*[1]" + - t: "over" + - x: "*[2]" + else: + - t: "fraction" + +- name: overview-default + tag: [msqrt, "square-root"] + match: "." + replace: + - t: "square root" + - test: + if: "IsNode(*[1], 'simple')" + then: + - test: + if: "$Verbosity!='Terse'" + then: [t: "of"] + - x: "*[1]" + +- name: overview-default + tag: [mroot, root] + match: "." 
+ replace: + - test: + if: "*[2][self::m:mn]" + then_test: + - if: "*[2][.='2']" + then: [t: "square root"] + - else_if: "*[2][.='3']" + then: [t: "cube root"] + - else_if: "*[2][not(contains(., '.'))]" + then: [x: "ToOrdinal(*[2])", t: "root"] + else: + - test: + if: "*[2][self::m:mi][string-length(.)=1]" + then: + - x: "*[2]" + - pronounce: [text: "-th", ipa: "θ", sapi5: "th", eloquence: "T"] + else: [x: "*[2]"] + - t: "root" + - test: + if: "IsNode(*[1], 'simple')" + then: + - test: + if: "$Verbosity!='Terse'" + then: [t: "of"] + - x: "*[1]" + +- name: matrix-override + tag: mrow + match: + - "*[2][self::m:mtable] and" + - "(IsBracketed(., '(', ')') or IsBracketed(., '[', ']') or IsBracketed(., '|', '|'))" + replace: + - t: "the" + - x: count(*[2]/*) + - t: "by" + - x: count(*[2]/*[self::m:mtr][1]/*) + - test: + if: "*[1][.='|']" # just need to check the first bracket since we know it must be (, [, or | + then: [t: "determinant"] + else: [t: "matrix"] + +- name: overview-default + tag: mtable + match: "." + replace: + - t: "the" + - x: count(*[2]/*) + - t: "by" + - x: count(*[2]/*[self::m:mtr][1]/*) + - t: "table" + +- name: short-mrow + tag: mrow + match: "count(*)<6" + replace: + - insert: + nodes: "*" + replace: [pause: auto] + +- name: long-mrow + tag: mrow + match: "." + replace: + - x: "*[1]" + - pause: auto + - x: "*[2]" + - pause: auto + - x: "*[3]" + - pause: auto + - x: "*[4]" + - pause: auto + - x: "*[5]" + - pause: auto + - t: "and so on" + +- include: "SimpleSpeak_Rules.yaml" diff --git a/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/es/SharedRules/calculus.yaml b/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/es/SharedRules/calculus.yaml new file mode 100644 index 00000000..9da69a80 --- /dev/null +++ b/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/es/SharedRules/calculus.yaml @@ -0,0 +1,39 @@ +--- +# fixture copied over from Rules + +- name: divergence + tag: divergence + match: "." 
+ replace: + - test: + if: "$Verbosity='Verbose'" + then: [t: "divergence"] # phrase('divergence' from the mean) + else: [t: "div"] # phrase('divergence' from the mean) + - t: "of" # phrase(systems 'of' linear equations) + - test: + if: "not(IsNode(*[1], 'leaf'))" + then: [pause: short] + - x: "*[1]" + +- name: curl + tag: curl + match: "." + replace: + - t: "curl of" # phrase(the 'curl of' a field) + - test: + if: "not(IsNode(*[1], 'leaf'))" + then: [pause: short] + - x: "*[1]" + +- name: gradient + tag: gradient + match: "." + replace: + - test: + if: "$Verbosity!='Terse'" + then: [t: "gradient of"] # phrase('divergence' from the mean) + else: [t: "del"] # phrase('divergence' from the mean) + - test: + if: "not(IsNode(*[1], 'leaf'))" + then: [pause: short] + - x: "*[1]" diff --git a/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/es/overview.yaml b/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/es/overview.yaml new file mode 100644 index 00000000..328938d8 --- /dev/null +++ b/PythonScripts/audit_translations/tests/fixtures/Rules/Languages/es/overview.yaml @@ -0,0 +1,110 @@ +--- +# fixture copied over from Rules + + +- name: overview-default + tag: mfrac + match: "." + replace: + - test: + if: "IsNode(*[1], 'simple') and IsNode(*[2], 'simple')" + then: + - x: "*[1]" + - T: "partido por" + - x: "*[2]" + else: + - T: "fracción" + +- name: overview-default + tag: msqrt + match: "." + replace: + - T: "raíz cuadrada" + - test: + if: "IsNode(*[1], 'simple')" + then: + - test: + if: "$Verbosity!='Terse'" + then: [T: de] + - x: "*[1]" + +- name: overview-default + tag: mroot + match: "." 
+ replace: + - test: + if: "*[2][self::m:mn]" + then_test: + - if: "*[2][.='2']" + then: [T: raíz cuadrada] + - else_if: "*[2][.='3']" + then: [T: raíz cúbica] + - else_if: "*[2][not(contains(., '.'))]" + then: [x: "ToOrdinal(*[2])", T: raíz] + else: + - test: + if: "*[2][self::m:mi][string-length(.)=1]" + then: + - x: "*[2]" + - pronounce: [text: "-th", ipa: "θ", sapi5: "th", eloquence: "T"] + else: [x: "*[2]"] + - T: raíz + - test: + if: "IsNode(*[1], 'simple')" + then: + - test: + if: "$Verbosity!='Terse'" + then: [T: de] + - x: "*[1]" + +- name: matrix-override + tag: mrow + match: + - "*[2][self::m:mtable] and" + - "(IsBracketed(., '(', ')') or IsBracketed(., '[', ']') or IsBracketed(., '|', '|'))" + replace: + - T: la + - x: count(*[2]/*) + - T: por + - x: count(*[2]/*[self::m:mtr][1]/*) + - test: + if: "*[1][.='|']" # just need to check the first bracket since we know it must be (, [, or | + then: [T: determinante] + else: [T: matriz] + +- name: overview-default + tag: mtable + match: "." + replace: + - T: la + - x: count(*[2]/*) + - T: por + - x: count(*[2]/*[self::m:mtr][1]/*) + - T: "tabla" + +- name: short-mrow + tag: mrow + match: "count(*)<6" + replace: + - insert: + nodes: "*" + replace: [pause: auto] + +- name: long-mrow + tag: mrow + match: "." + replace: + - x: "*[1]" + - pause: auto + - x: "*[2]" + - pause: auto + - x: "*[3]" + - pause: auto + - x: "*[4]" + - pause: auto + - x: "*[5]" + - pause: auto + - T: "etc." 
+ +- include: "SimpleSpeak_Rules.yaml" + diff --git a/PythonScripts/audit_translations/tests/golden/jsonl/de.json b/PythonScripts/audit_translations/tests/golden/jsonl/de.json index 010d64db..f5c4600f 100644 --- a/PythonScripts/audit_translations/tests/golden/jsonl/de.json +++ b/PythonScripts/audit_translations/tests/golden/jsonl/de.json @@ -5,8 +5,10 @@ "rule_name": "rule-2", "rule_tag": "mn", "rule_key": "rule-2|mn", - "line_en": 7, - "line_tr": null, + "issue_line_en": 7, + "issue_line_tr": null, + "rule_line_en": 7, + "rule_line_tr": null, "issue_type": "missing_rule", "diff_type": "", "description": "Rule present in English but missing in translation", @@ -21,8 +23,10 @@ "rule_name": "rule-3", "rule_tag": "mo", "rule_key": "rule-3|mo", - "line_en": null, - "line_tr": 7, + "issue_line_en": null, + "issue_line_tr": 7, + "rule_line_en": null, + "rule_line_tr": 7, "issue_type": "extra_rule", "diff_type": "", "description": "Rule present in translation but missing in English", @@ -37,8 +41,10 @@ "rule_name": "rule-1", "rule_tag": "mo", "rule_key": "rule-1|mo", - "line_en": null, - "line_tr": 1, + "issue_line_en": null, + "issue_line_tr": 5, + "rule_line_en": null, + "rule_line_tr": 1, "issue_type": "untranslated_text", "diff_type": "", "description": "Lowercase t/ot/ct keys indicate untranslated text", @@ -55,8 +61,10 @@ "rule_name": "condition-none", "rule_tag": "mi", "rule_key": "condition-none|mi", - "line_en": 1, - "line_tr": 1, + "issue_line_en": 6, + "issue_line_tr": 6, + "rule_line_en": 1, + "rule_line_tr": 1, "issue_type": "rule_difference", "diff_type": "condition", "description": "Conditions differ", @@ -71,8 +79,10 @@ "rule_name": "match-rule", "rule_tag": "mo", "rule_key": "match-rule|mo", - "line_en": 1, - "line_tr": 1, + "issue_line_en": 3, + "issue_line_tr": 3, + "rule_line_en": 1, + "rule_line_tr": 1, "issue_type": "rule_difference", "diff_type": "match", "description": "Match pattern differs", @@ -87,8 +97,10 @@ "rule_name": "cond-rule", 
"rule_tag": "mi", "rule_key": "cond-rule|mi", - "line_en": 1, - "line_tr": 1, + "issue_line_en": 6, + "issue_line_tr": 6, + "rule_line_en": 1, + "rule_line_tr": 1, "issue_type": "rule_difference", "diff_type": "condition", "description": "Conditions differ", @@ -103,8 +115,10 @@ "rule_name": "struct-rule", "rule_tag": "mi", "rule_key": "struct-rule|mi", - "line_en": 1, - "line_tr": 1, + "issue_line_en": 9, + "issue_line_tr": 1, + "rule_line_en": 1, + "rule_line_tr": 1, "issue_type": "rule_difference", "diff_type": "structure", "description": "Rule structure differs (test/if/then/else blocks)", @@ -119,8 +133,10 @@ "rule_name": "struct-empty", "rule_tag": "mi", "rule_key": "struct-empty|mi", - "line_en": 1, - "line_tr": 1, + "issue_line_en": 4, + "issue_line_tr": 1, + "rule_line_en": 1, + "rule_line_tr": 1, "issue_type": "rule_difference", "diff_type": "structure", "description": "Rule structure differs (test/if/then/else blocks)", @@ -135,8 +151,10 @@ "rule_name": "", "rule_tag": "", "rule_key": "b", - "line_en": 3, - "line_tr": null, + "issue_line_en": 3, + "issue_line_tr": null, + "rule_line_en": 3, + "rule_line_tr": null, "issue_type": "missing_rule", "diff_type": "", "description": "Rule present in English but missing in translation", @@ -151,8 +169,10 @@ "rule_name": "", "rule_tag": "", "rule_key": "c", - "line_en": null, - "line_tr": 5, + "issue_line_en": null, + "issue_line_tr": 5, + "rule_line_en": null, + "rule_line_tr": 5, "issue_type": "extra_rule", "diff_type": "", "description": "Rule present in translation but missing in English", @@ -167,8 +187,10 @@ "rule_name": "", "rule_tag": "", "rule_key": "a", - "line_en": null, - "line_tr": 1, + "issue_line_en": null, + "issue_line_tr": 2, + "rule_line_en": null, + "rule_line_tr": 1, "issue_type": "untranslated_text", "diff_type": "", "description": "Lowercase t/ot/ct keys indicate untranslated text", @@ -185,8 +207,10 @@ "rule_name": "vars-rule", "rule_tag": "mo", "rule_key": "vars-rule|mo", - "line_en": 1, - 
"line_tr": 1, + "issue_line_en": 4, + "issue_line_tr": 4, + "rule_line_en": 1, + "rule_line_tr": 1, "issue_type": "rule_difference", "diff_type": "variables", "description": "Variable definitions differ", @@ -201,8 +225,10 @@ "rule_name": "vars-none", "rule_tag": "mo", "rule_key": "vars-none|mo", - "line_en": 1, - "line_tr": 1, + "issue_line_en": 4, + "issue_line_tr": 1, + "rule_line_en": 1, + "rule_line_tr": 1, "issue_type": "rule_difference", "diff_type": "variables", "description": "Variable definitions differ", diff --git a/PythonScripts/audit_translations/tests/test_auditor.py b/PythonScripts/audit_translations/tests/test_auditor.py index 7d2d3e45..54b838cd 100644 --- a/PythonScripts/audit_translations/tests/test_auditor.py +++ b/PythonScripts/audit_translations/tests/test_auditor.py @@ -36,7 +36,7 @@ def test_collect_issues_fields() -> None: result = ComparisonResult( missing_rules=[missing], extra_rules=[extra], - untranslated_text=[(untranslated, ["x"])], + untranslated_text=[(untranslated, [("t", "x", 31)])], rule_differences=[diff], file_path="", english_rule_count=1, @@ -46,19 +46,28 @@ def test_collect_issues_fields() -> None: issues = collect_issues(result, "file.yaml", "xx") by_type = {issue["issue_type"]: issue for issue in issues} - assert by_type["missing_rule"]["line_en"] == 10 - assert by_type["missing_rule"]["line_tr"] is None + assert by_type["missing_rule"]["issue_line_en"] == 10 + assert by_type["missing_rule"]["issue_line_tr"] is None + assert by_type["missing_rule"]["rule_line_en"] == 10 + assert by_type["missing_rule"]["rule_line_tr"] is None assert "english_raw" not in by_type["missing_rule"] - assert by_type["extra_rule"]["line_tr"] == 20 + assert by_type["extra_rule"]["issue_line_tr"] == 20 + assert by_type["extra_rule"]["rule_line_tr"] == 20 assert "translated_raw" not in by_type["extra_rule"] assert by_type["untranslated_text"]["untranslated_texts"] == ["x"] + assert by_type["untranslated_text"]["issue_line_tr"] == 31 + assert 
by_type["untranslated_text"]["rule_line_tr"] == 30 assert "translated_raw" not in by_type["untranslated_text"] assert by_type["rule_difference"]["diff_type"] == "match" assert by_type["rule_difference"]["english_snippet"] == "a" assert by_type["rule_difference"]["translated_snippet"] == "b" + assert by_type["rule_difference"]["issue_line_en"] == 40 + assert by_type["rule_difference"]["issue_line_tr"] == 41 + assert by_type["rule_difference"]["rule_line_en"] == 40 + assert by_type["rule_difference"]["rule_line_tr"] == 41 assert "english_raw" not in by_type["rule_difference"] diff --git a/PythonScripts/audit_translations/tests/test_cli_end_to_end.py b/PythonScripts/audit_translations/tests/test_cli_end_to_end.py new file mode 100644 index 00000000..ce1e325c --- /dev/null +++ b/PythonScripts/audit_translations/tests/test_cli_end_to_end.py @@ -0,0 +1,84 @@ +""" +CLI coverage tests for audit_translations. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import sys +from collections import Counter +from pathlib import Path + +from .. import cli as audit_cli + + +def fixture_rules_dir() -> Path: + return Path(__file__).resolve().parent / "fixtures" / "Rules" / "Languages" + + +def parse_jsonl(output: str) -> list[dict]: + return [json.loads(line) for line in output.splitlines() if line.strip()] + + +def assert_issue_counts(issues: list[dict]) -> None: + counts = Counter(issue["issue_type"] for issue in issues) + assert len(issues) == 19 + assert counts["missing_rule"] == 4 + assert counts["extra_rule"] == 3 + assert counts["untranslated_text"] == 6 + assert counts["rule_difference"] == 6 + + +def test_cli_main_jsonl_output_matches_fixture(capsys, monkeypatch) -> None: + """ + Exercise the CLI entrypoint in-process by patching sys.argv. + + This validates argparse wiring and output formatting without spawning a new process. 
+ """ + rules_dir = fixture_rules_dir() + args = [ + "es", + "--format", + "jsonl", + "--rules-dir", + str(rules_dir), + ] + + monkeypatch.setattr(sys, "argv", ["audit_translations", *args]) + audit_cli.main() + in_process_output = capsys.readouterr().out + assert_issue_counts(parse_jsonl(in_process_output)) + + +def test_cli_module_jsonl_output_matches_fixture() -> None: + """ + Exercise the CLI via python -m audit_translations in a subprocess. + + This validates module execution, environment wiring, and exit behavior. + """ + rules_dir = fixture_rules_dir() + args = [ + "es", + "--format", + "jsonl", + "--rules-dir", + str(rules_dir), + ] + + python_scripts_dir = Path(__file__).resolve().parents[2] + env = os.environ.copy() + env["PYTHONPATH"] = os.pathsep.join( + [str(python_scripts_dir), env.get("PYTHONPATH", "")] + ).strip(os.pathsep) + + result = subprocess.run( + [sys.executable, "-m", "audit_translations", *args], + capture_output=True, + text=True, + cwd=str(python_scripts_dir), + env=env, + check=True, + ) + assert_issue_counts(parse_jsonl(result.stdout)) diff --git a/PythonScripts/audit_translations/tests/test_parsers.py b/PythonScripts/audit_translations/tests/test_parsers.py index 4685fc65..d500ced2 100644 --- a/PythonScripts/audit_translations/tests/test_parsers.py +++ b/PythonScripts/audit_translations/tests/test_parsers.py @@ -13,8 +13,10 @@ extract_match_pattern, extract_structure_elements, extract_variables, + find_untranslated_text_entries, find_untranslated_text_values, has_audit_ignore, + build_line_map, parse_rules_file, parse_unicode_file, ) @@ -96,6 +98,18 @@ def test_ignores_single_punctuation(self): content = {"t": "."} assert find_untranslated_text_values(content) == [] + def test_finds_entries_with_lines(self): + """Ensure finds entries with line numbers.""" + yaml = YAML() + content = """- name: line-check + tag: mo + replace: + - t: "not translated" +""" + data = yaml.load(content) + entries = find_untranslated_text_entries(data[0]) + 
assert entries == [("t", "not translated", 4)] + class TestParseRulesFile: def test_parses_simple_rule(self): @@ -114,6 +128,7 @@ def test_parses_simple_rule(self): assert rules[0].tag == "mo" assert rules[0].key == "my-rule|mo" assert rules[0].line_number == 1 + assert rules[0].line_map["match"] == [3] def test_parses_multiple_rules(self): """Ensure parses multiple rules.""" @@ -144,6 +159,7 @@ def test_detects_untranslated_text(self): rules = parse_rules_file(content, data) assert rules[0].has_untranslated_text assert "not translated" in rules[0].untranslated_keys + assert rules[0].untranslated_entries == [("t", "not translated", 4)] def test_detects_audit_ignore(self): """Ensure detects audit ignore.""" @@ -306,6 +322,30 @@ def test_extracts_test_structure(self): assert "else:" in elements +class TestBuildLineMap: + def test_builds_line_map_for_rule_elements(self): + """Ensure line map captures nested element lines.""" + content = """- name: line-map + tag: mo + match: "." + if: cond + variables: + - foo: bar + test: + if: cond2 + then: + - t: "x" +""" + yaml = YAML() + data = yaml.load(content) + line_map = build_line_map(data[0]) + assert line_map["match"] == [3] + assert line_map["condition"] == [4, 8] + assert line_map["variables"] == [5] + assert line_map["structure:test"] == [7] + assert line_map["structure:if"] == [4, 8] + + def make_rule(name: str, tag: str, data) -> RuleInfo: """Helper to create RuleInfo for testing""" return RuleInfo( From 7bb8919ee169fc74aff0f9377f12bf8575393fda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Gro=C3=9F?= Date: Fri, 9 Jan 2026 19:38:56 +0100 Subject: [PATCH 4/8] fix merge --- PythonScripts/audit_translations/auditor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PythonScripts/audit_translations/auditor.py b/PythonScripts/audit_translations/auditor.py index 3078a574..dd0c68ae 100644 --- a/PythonScripts/audit_translations/auditor.py +++ b/PythonScripts/audit_translations/auditor.py @@ -355,7 
+355,7 @@ def print_warnings(result: ComparisonResult, file_name: str, verbose: bool = Fal else: line_en = resolve_issue_line(diff.english_rule, diff.diff_type) line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type) - print_diff_item(diff, line_en=line_en, line_tr=line_tr) + print_diff_item(diff, line_en=line_en, line_tr=line_tr, verbose=verbose) issues += 1 if result.extra_rules: From 56290e2df22e0ad75c1538ca398b7488154aea6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Gro=C3=9F?= Date: Fri, 9 Jan 2026 19:58:53 +0100 Subject: [PATCH 5/8] Add tests for `print_warnings` function to verify verbose and non-verbose output behavior with golden snapshots. --- .../rich/structure_diff_nonverbose.golden | 9 ++++ .../golden/rich/structure_diff_verbose.golden | 11 +++++ .../audit_translations/tests/test_auditor.py | 42 ++++++++++++++++++- 3 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 PythonScripts/audit_translations/tests/golden/rich/structure_diff_nonverbose.golden create mode 100644 PythonScripts/audit_translations/tests/golden/rich/structure_diff_verbose.golden diff --git a/PythonScripts/audit_translations/tests/golden/rich/structure_diff_nonverbose.golden b/PythonScripts/audit_translations/tests/golden/rich/structure_diff_nonverbose.golden new file mode 100644 index 00000000..40fd15c6 --- /dev/null +++ b/PythonScripts/audit_translations/tests/golden/rich/structure_diff_nonverbose.golden @@ -0,0 +1,9 @@ + +──────────────────────────────────────────────────────────────────────────────── +✓ structure_diff.yaml + English: 1 rules → Translated: 1 rules +──────────────────────────────────────────────────────────────────────────────── + + ≠ Rule Differences [1] (structural differences between en and translation) + • struct-rule (line 9 en, 1 tr) + Rule structure differs (test/if/then/else blocks) diff --git a/PythonScripts/audit_translations/tests/golden/rich/structure_diff_verbose.golden 
b/PythonScripts/audit_translations/tests/golden/rich/structure_diff_verbose.golden new file mode 100644 index 00000000..3af596bd --- /dev/null +++ b/PythonScripts/audit_translations/tests/golden/rich/structure_diff_verbose.golden @@ -0,0 +1,11 @@ + +──────────────────────────────────────────────────────────────────────────────── +✓ structure_diff.yaml + English: 1 rules → Translated: 1 rules +──────────────────────────────────────────────────────────────────────────────── + + ≠ Rule Differences [1] (structural differences between en and translation) + • struct-rule (line 9 en, 1 tr) + Rule structure differs (test/if/then/else blocks) + en: replace: test: if: then: else: + tr: replace: test: if: then: diff --git a/PythonScripts/audit_translations/tests/test_auditor.py b/PythonScripts/audit_translations/tests/test_auditor.py index 54b838cd..6b452014 100644 --- a/PythonScripts/audit_translations/tests/test_auditor.py +++ b/PythonScripts/audit_translations/tests/test_auditor.py @@ -2,7 +2,9 @@ Tests for auditor helpers. """ -from ..auditor import collect_issues, compare_files, console, get_yaml_files, list_languages +from pathlib import Path + +from ..auditor import collect_issues, compare_files, console, get_yaml_files, list_languages, print_warnings from ..dataclasses import ComparisonResult, RuleDifference, RuleInfo @@ -179,3 +181,41 @@ def test_list_languages_includes_region_codes(tmp_path) -> None: assert "zz" in output assert "zz-aa" in output + + +def test_print_warnings_omits_snippets_when_not_verbose() -> None: + """ + Ensure the print_warnings output matches the non-verbose golden snapshot. 
+ """ + base_dir = Path(__file__).parent + fixtures_dir = base_dir / "fixtures" + golden_path = base_dir / "golden" / "rich" / "structure_diff_nonverbose.golden" + result = compare_files( + str(fixtures_dir / "en" / "structure_diff.yaml"), + str(fixtures_dir / "de" / "structure_diff.yaml"), + ) + + with console.capture() as capture: + print_warnings(result, "structure_diff.yaml", verbose=False) + output = capture.get() + + assert output == golden_path.read_text(encoding="utf-8") + + +def test_print_warnings_includes_snippets_when_verbose() -> None: + """ + Ensure the print_warnings output matches the verbose golden snapshot. + """ + base_dir = Path(__file__).parent + fixtures_dir = base_dir / "fixtures" + golden_path = base_dir / "golden" / "rich" / "structure_diff_verbose.golden" + result = compare_files( + str(fixtures_dir / "en" / "structure_diff.yaml"), + str(fixtures_dir / "de" / "structure_diff.yaml"), + ) + + with console.capture() as capture: + print_warnings(result, "structure_diff.yaml", verbose=True) + output = capture.get() + + assert output == golden_path.read_text(encoding="utf-8") From e8b54383c8b248913adde29fb6c736c969153c13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Gro=C3=9F?= Date: Fri, 9 Jan 2026 20:00:58 +0100 Subject: [PATCH 6/8] Revert "use LazyLock in prefs.rs" This reverts commit 5bfea6fdfd7257b84fb8e7b613e1df520ea1fbe5. 
--- src/prefs.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/prefs.rs b/src/prefs.rs index 39137d40..7443511e 100644 --- a/src/prefs.rs +++ b/src/prefs.rs @@ -26,7 +26,6 @@ extern crate dirs; use std::cell::RefCell; use std::rc::Rc; use std::path::{Path, PathBuf}; -use std::sync::LazyLock; use crate::speech::{as_str_checked, RulesFor, FileAndTime}; use std::collections::{HashMap, HashSet}; use phf::phf_set; @@ -36,7 +35,9 @@ use crate::errors::*; /// Use to indicate preference not found with Preference::to_string() pub static NO_PREFERENCE: &str = "\u{FFFF}"; -static DEFAULT_LANG: LazyLock = LazyLock::new(|| Yaml::String("en".to_string())); +lazy_static! { + static ref DEFAULT_LANG: Yaml = Yaml::String("en".to_string()); +} // Preferences are recorded here From b9612260b1637f2cdec562cc26f570145f7059bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Gro=C3=9F?= Date: Fri, 9 Jan 2026 20:01:06 +0100 Subject: [PATCH 7/8] Revert "change more parts" This reverts commit 2f10adff4dbebad0176b5d795aff14165469ea31. 
--- src/chemistry.rs | 15 +++++++++------ src/infer_intent.rs | 36 +++++++++++++++++------------------- src/interface.rs | 23 ++++++++++++++--------- src/speech.rs | 11 +++++------ src/tts.rs | 23 +++++++++++++++-------- src/xpath_functions.rs | 15 ++++++++++----- tests/common/mod.rs | 7 +++++-- 7 files changed, 75 insertions(+), 55 deletions(-) diff --git a/src/chemistry.rs b/src/chemistry.rs index 1d94fe32..8d2f625d 100644 --- a/src/chemistry.rs +++ b/src/chemistry.rs @@ -46,7 +46,6 @@ use std::convert::TryInto; use std::collections::HashSet; use std::cmp::Ordering; use crate::errors::*; -use std::sync::LazyLock; pub static NOT_CHEMISTRY: isize = -10000; // should overwhelm any positive signal @@ -940,8 +939,10 @@ fn likely_chem_superscript(sup: Element) -> isize { // bullet is radical (en.wikipedia.org/wiki/Radical_(chemistry)#Depiction_in_chemical_reactions); mhchem uses dot operator // these can stand alone, be followed by +/- or have a number in front "(2•)-"" [examples from mhchem documentation] // roman numerals are "oxidation state" and range from -4 to +9 - static MULTIPLE_PLUS_OR_MINUS_OR_DOT: LazyLock = LazyLock::new(|| Regex::new(r"^\++$|^-+$|^\U{2212}+$|^[⋅∙•][-+\U{2212}]*$").unwrap()); - static SINGLE_PLUS_OR_MINUS_OR_DOT: LazyLock = LazyLock::new(|| Regex::new(r"^[+-\U{2212}⋅∙•]$").unwrap()); + lazy_static! { + static ref MULTIPLE_PLUS_OR_MINUS_OR_DOT: Regex = Regex::new(r"^\++$|^-+$|^\U{2212}+$|^[⋅∙•][-+\U{2212}]*$").unwrap(); + static ref SINGLE_PLUS_OR_MINUS_OR_DOT: Regex = Regex::new(r"^[+-\U{2212}⋅∙•]$").unwrap(); + } static DOTS: &[char; 3] = &['⋅', '∙', '•']; let sup_name = name(sup); if sup_name == "mo" && MULTIPLE_PLUS_OR_MINUS_OR_DOT.is_match(as_text(sup)) { @@ -1616,9 +1617,11 @@ fn is_equilibrium_constant(mut mathml: Element) -> bool { return name(mathml) == "mi" && as_text(mathml) == "K"; } -// Oxidation states range from -4 to 9 and are written with (a subset of) roman numerals. -// All instances seem to be upper case that I've seen. 
-static SMALL_UPPER_ROMAN_NUMERAL: LazyLock = LazyLock::new(|| Regex::new(r"^\s*^(IX|IV|V?I{0,3})\s*$").unwrap()); +lazy_static! { + // Oxidation states range from -4 to 9 and are written with (a subset of) roman numerals. + // All instances seem to be upper case that I've seen. + static ref SMALL_UPPER_ROMAN_NUMERAL: Regex = Regex::new(r"^\s*^(IX|IV|V?I{0,3})\s*$").unwrap(); +} /// look for "(s), "(l)", "(g)", "(aq)" (could also use [...]) /// this might be called before canonicalization, but in clean_chemistry_mrow, we made sure "( xxx )" is grouped properly diff --git a/src/infer_intent.rs b/src/infer_intent.rs index 4c92e7a0..7da65768 100644 --- a/src/infer_intent.rs +++ b/src/infer_intent.rs @@ -11,7 +11,6 @@ use crate::speech::SpeechRulesWithContext; use crate::canonicalize::{as_element, as_text, name, create_mathml_element, set_mathml_name, INTENT_ATTR, MATHML_FROM_NAME_ATTR}; use crate::errors::*; use std::fmt; -use std::sync::LazyLock; use crate::pretty_print::mml_to_string; use crate::xpath_functions::is_leaf; use regex::Regex; @@ -244,24 +243,23 @@ pub fn intent_speech_for_name(intent_name: &str, verbosity: &str, fixity: &str) // property := S ':' NCName // S := [ \t\n\r]* -// The practical restrictions of NCName are that it cannot contain several symbol characters like -// !, ", #, $, %, &, ', (, ), *, +, ,, /, :, ;, <, =, >, ?, @, [, \, ], ^, `, {, |, }, ~, and whitespace characters -// Furthermore an NCName cannot begin with a number, dot or minus character although they can appear later in an NCName. 
-// NC_NAME defined in www.w3.org/TR/REC-xml/#sec-common-syn, but is complicated -// We follow NC_NAME for the basic latin block, but then allow everything -static CONCEPT_OR_LITERAL: LazyLock = LazyLock::new(|| { - Regex::new(r#"^[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // NC_NAME but simpler - ).unwrap() -}); -static PROPERTY: LazyLock = LazyLock::new(|| { - Regex::new(r#"^:[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // : NC_NAME - ).unwrap() -}); -static ARG_REF: LazyLock = LazyLock::new(|| { - Regex::new(r#"^\$[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // $ NC_NAME - ).unwrap() -}); -static NUMBER: LazyLock = LazyLock::new(|| Regex::new(r#"^-?[0-9]+(\.[0-9]+)?"#).unwrap()); +lazy_static! { + // The practical restrictions of NCName are that it cannot contain several symbol characters like + // !, ", #, $, %, &, ', (, ), *, +, ,, /, :, ;, <, =, >, ?, @, [, \, ], ^, `, {, |, }, ~, and whitespace characters + // Furthermore an NCName cannot begin with a number, dot or minus character although they can appear later in an NCName. 
+ // NC_NAME defined in www.w3.org/TR/REC-xml/#sec-common-syn, but is complicated + // We follow NC_NAME for the basic latin block, but then allow everything + static ref CONCEPT_OR_LITERAL: Regex = Regex::new( + r#"^[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // NC_NAME but simpler + ).unwrap(); + static ref PROPERTY: Regex = Regex::new( + r#"^:[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // : NC_NAME + ).unwrap(); + static ref ARG_REF: Regex = Regex::new( + r#"^\$[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*"# // $ NC_NAME + ).unwrap(); + static ref NUMBER: Regex = Regex::new(r#"^-?[0-9]+(\.[0-9]+)?"#).unwrap(); +} static TERMINALS_AS_U8: [u8; 3] = [b'(', b',', b')']; // static TERMINALS: [char; 3] = ['(', ',',')']; diff --git a/src/interface.rs b/src/interface.rs index 87cc6085..2d92c2d0 100644 --- a/src/interface.rs +++ b/src/interface.rs @@ -3,7 +3,6 @@ #![allow(non_snake_case)] #![allow(clippy::needless_return)] use std::cell::RefCell; -use std::sync::LazyLock; use crate::canonicalize::{as_text, create_mathml_element}; use crate::errors::*; @@ -92,12 +91,14 @@ pub fn get_version() -> String { /// The ids can be used for sync highlighting if the `Bookmark` API preference is true. 
pub fn set_mathml(mathml_str: impl AsRef) -> Result { enable_logs(); - // if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822) - static MATHJAX_V2: LazyLock = LazyLock::new(|| Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap()); - static MATHJAX_V3: LazyLock = LazyLock::new(|| Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap()); - static NAMESPACE_DECL: LazyLock = LazyLock::new(|| Regex::new(r#"xmlns:[[:alpha:]]+"#).unwrap()); // very limited namespace prefix match - static PREFIX: LazyLock = LazyLock::new(|| Regex::new(r#"( = LazyLock::new(|| Regex::new(r#"&([a-zA-Z]+?);"#).unwrap()); + lazy_static! { + // if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822) + static ref MATHJAX_V2: Regex = Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap(); + static ref MATHJAX_V3: Regex = Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap(); + static ref NAMESPACE_DECL: Regex = Regex::new(r#"xmlns:[[:alpha:]]+"#).unwrap(); // very limited namespace prefix match + static ref PREFIX: Regex = Regex::new(r#"( = LazyLock::new(|| Regex::new(r#"[ \u{0009}\u{000A}\u{00C}\u{000D}]+"#).unwrap()); + lazy_static! { + static ref WHITESPACE_MATCH: Regex = Regex::new(r#"[ \u{0009}\u{000A}\u{00C}\u{000D}]+"#).unwrap(); + } if is_leaf(e) && (!allow_structure_in_leaves || IsNode::is_mathml(e)) { // Assume it is HTML inside of the leaf -- turn the HTML into a string @@ -1103,7 +1106,9 @@ mod tests { set_mathml("𝕞").unwrap(); // need to remove unique ids - static ID_MATCH: LazyLock = LazyLock::new(|| Regex::new(r#"id='.+?' "#).unwrap()); + lazy_static! { + static ref ID_MATCH: Regex = Regex::new(r#"id='.+?' 
"#).unwrap(); + } let entity_str = ID_MATCH.replace_all(&entity_str, ""); let converted_str = ID_MATCH.replace_all(&converted_str, ""); assert_eq!(entity_str, converted_str, "normal entity test failed"); diff --git a/src/speech.rs b/src/speech.rs index 769f58bf..5a85ebe4 100644 --- a/src/speech.rs +++ b/src/speech.rs @@ -6,7 +6,6 @@ use std::path::PathBuf; use std::collections::HashMap; use std::cell::{RefCell, RefMut}; -use std::sync::LazyLock; use sxd_document::dom::{ChildOfElement, Document, Element}; use sxd_document::{Package, QName}; use sxd_xpath::context::Evaluation; @@ -566,13 +565,13 @@ impl InsertChildren { } -static ATTR_NAME_VALUE: LazyLock = LazyLock::new(|| { - Regex::new( +lazy_static! { + static ref ATTR_NAME_VALUE: Regex = Regex::new( // match name='value', where name is sort of an NCNAME (see CONCEPT_OR_LITERAL in infer_intent.rs) - // The quotes can be either single or double quotes + // The quotes can be either single or double quotes r#"(?P[^\s\u{0}-\u{40}\[\\\]^`\u{7B}-\u{BF}][^\s\u{0}-\u{2C}/:;<=>?@\[\\\]^`\u{7B}-\u{BF}]*)\s*=\s*('(?P[^']+)'|"(?P[^"]+)")"# - ).unwrap() -}); + ).unwrap(); +} // structure used when "intent:" is encountered in a rule // the name is either a string or an xpath that needs evaluation. 99% of the time it is a string diff --git a/src/tts.rs b/src/tts.rs index fcdfc468..11d52fe5 100644 --- a/src/tts.rs +++ b/src/tts.rs @@ -77,7 +77,6 @@ use std::string::ToString; use std::str::FromStr; use strum_macros::{Display, EnumString}; use regex::Regex; -use std::sync::LazyLock; use sxd_xpath::Value; const MIN_PAUSE:f64 = 50.0; // ms -- avoids clutter of putting out pauses that probably can't be heard @@ -596,7 +595,9 @@ impl TTS { /// The computation is based on the length of the speech strings (after removing tagging). /// There is a bias towards pausing more _after_ longer strings. 
pub fn compute_auto_pause(&self, prefs: &PreferenceManager, before: &str, after: &str) -> String { - static REMOVE_XML: LazyLock = LazyLock::new(|| Regex::new(r"<.+?>").unwrap()); // punctuation ending with a '.' + lazy_static! { + static ref REMOVE_XML: Regex = Regex::new(r"<.+?>").unwrap(); // punctuation ending with a '.' + } let before_len; let after_len; match self { @@ -652,8 +653,10 @@ impl TTS { fn merge_pauses_none(&self, str: &str) -> String { // punctuation used for pauses is ",", ";" - static SPACES: LazyLock = LazyLock::new(|| Regex::new(r"\s+([;,])").unwrap()); // two or more pauses - static MULTIPLE_PAUSES: LazyLock = LazyLock::new(|| Regex::new(r"([,;][,;]+)").unwrap()); // two or more pauses + lazy_static! { + static ref SPACES: Regex = Regex::new(r"\s+([;,])").unwrap(); // two or more pauses + static ref MULTIPLE_PAUSES: Regex = Regex::new(r"([,;][,;]+)").unwrap(); // two or more pauses + } // we reduce all sequences of two or more pauses to a single medium pause let merges_string = SPACES.replace_all(str, "$1").to_string(); let merges_string = MULTIPLE_PAUSES.replace_all(&merges_string, ";").to_string(); @@ -677,15 +680,19 @@ impl TTS { } fn merge_pauses_sapi5(&self, str: &str) -> String { - static CONSECUTIVE_BREAKS: LazyLock = LazyLock::new(|| Regex::new(r"(]+?> *){2,}").unwrap()); // two or more pauses - static PAUSE_AMOUNT: LazyLock = LazyLock::new(|| Regex::new(r"msec=.*?(\d+)").unwrap()); // amount after 'time' + lazy_static! 
{ + static ref CONSECUTIVE_BREAKS: Regex = Regex::new(r"(]+?> *){2,}").unwrap(); // two or more pauses + static ref PAUSE_AMOUNT: Regex = Regex::new(r"msec=.*?(\d+)").unwrap(); // amount after 'time' + } let replacement = |amount: usize| format!(""); return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement); } fn merge_pauses_ssml(&self, str: &str) -> String { - static CONSECUTIVE_BREAKS: LazyLock = LazyLock::new(|| Regex::new(r"(]+?> *){2,}").unwrap()); // two or more pauses - static PAUSE_AMOUNT: LazyLock = LazyLock::new(|| Regex::new(r"time=.*?(\d+)").unwrap()); // amount after 'time' + lazy_static! { + static ref CONSECUTIVE_BREAKS: Regex = Regex::new(r"(]+?> *){2,}").unwrap(); // two or more pauses + static ref PAUSE_AMOUNT: Regex = Regex::new(r"time=.*?(\d+)").unwrap(); // amount after 'time' + } let replacement = |amount: usize| format!(""); return TTS::merge_pauses_xml(str, &CONSECUTIVE_BREAKS, &PAUSE_AMOUNT, replacement); } diff --git a/src/xpath_functions.rs b/src/xpath_functions.rs index 49543688..ae68edf3 100644 --- a/src/xpath_functions.rs +++ b/src/xpath_functions.rs @@ -23,7 +23,6 @@ use crate::definitions::{Definitions, SPEECH_DEFINITIONS, BRAILLE_DEFINITIONS}; use regex::Regex; use crate::pretty_print::mml_to_string; use std::cell::{Ref, RefCell}; -use std::sync::LazyLock; use std::thread::LocalKey; use phf::phf_set; use sxd_xpath::function::Error as XPathError; @@ -266,7 +265,9 @@ impl IsNode { // Returns true if 'frac' is a common fraction // In this case, the numerator and denominator can be no larger than 'num_limit' and 'denom_limit' fn is_common_fraction(frac: Element, num_limit: usize, denom_limit: usize) -> bool { - static ALL_DIGITS: LazyLock = LazyLock::new(|| Regex::new(r"\d+").unwrap()); // match one or more digits + lazy_static! 
{
+        static ref ALL_DIGITS: Regex = Regex::new(r"\d+").unwrap();   // match one or more digits
+    }
 
     if !is_tag(frac, "mfrac") && !is_tag(frac, "fraction"){
         return false;
@@ -448,7 +449,9 @@ impl ToOrdinal {
      * Returns the string representation of that number or an error message
      */
     fn convert(number: &str, fractional: bool, plural: bool) -> Option<String> {
-        static NO_DIGIT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^\d]").unwrap());  // match anything except a digit
+        lazy_static! {
+            static ref NO_DIGIT: Regex = Regex::new(r"[^\d]").unwrap();  // match anything except a digit
+        }
         return SPEECH_DEFINITIONS.with(|definitions| {
             let definitions = definitions.borrow();
             let numbers_large = definitions.get_vec("NumbersLarge")?;
@@ -1346,8 +1349,10 @@ pub struct FontSizeGuess;
 // returns original node match isn't found
 impl FontSizeGuess {
     pub fn em_from_value(value_with_unit: &str) -> f64 {
-        // match one or more digits followed by a unit -- there are many more units, but they tend to be large and rarer(?)
-        static FONT_VALUE: LazyLock<Regex> = LazyLock::new(|| { Regex::new(r"(-?[0-9]*\.?[0-9]*)(px|cm|mm|Q|in|ppc|pt|ex|em|rem)").unwrap() });
+        lazy_static! {
+            // match one or more digits followed by a unit -- there are many more units, but they tend to be large and rarer(?)
+            static ref FONT_VALUE: Regex = Regex::new(r"(-?[0-9]*\.?[0-9]*)(px|cm|mm|Q|in|ppc|pt|ex|em|rem)").unwrap();
+        }
         let cap = FONT_VALUE.captures(value_with_unit);
         if let Some(cap) = cap {
             if cap.len() == 3 {
diff --git a/tests/common/mod.rs b/tests/common/mod.rs
index d1ea70df..482e588a 100644
--- a/tests/common/mod.rs
+++ b/tests/common/mod.rs
@@ -2,7 +2,8 @@
 #[cfg(test)]
 
 use regex::Regex;
-use std::sync::LazyLock;
+extern crate lazy_static;
+use lazy_static::lazy_static;
 
 pub use libmathcat::interface::*;
 
@@ -34,7 +35,9 @@ pub fn abs_rules_dir_path() -> String {
 // Strip spaces from 'str' so comparison doesn't need to worry about spacing
 #[allow(dead_code)]     // used in testing
 fn strip_spaces(str: &str) -> String {
-    static SPACES: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" +").unwrap());
+    lazy_static! {
+        static ref SPACES: Regex = Regex::new(r" +").unwrap();
+    }
     return String::from(SPACES.replace_all(str, " "));
 }
 

From 64ff2094b711d9c8806903816141a9f31720db8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Moritz=20Gro=C3=9F?=
Date: Fri, 9 Jan 2026 20:13:23 +0100
Subject: [PATCH 8/8] Add test command to audit_translations README

---
 PythonScripts/audit_translations/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/PythonScripts/audit_translations/README.md b/PythonScripts/audit_translations/README.md
index 80dff1d4..26455086 100644
--- a/PythonScripts/audit_translations/README.md
+++ b/PythonScripts/audit_translations/README.md
@@ -94,3 +94,7 @@ uv run python -m audit_translations de-CH
 # Show detailed output with English/translated snippets for rule differences
 uv run python -m audit_translations es --verbose
 ```
+
+### Testing
+
+```uv run python -m pytest```
\ No newline at end of file