From a8176ee3170a2b2506e6f74ffb0088f56d170e65 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Wed, 1 Nov 2023 01:42:23 +0100 Subject: [PATCH 1/6] draft that still has error --- src/parser/mod.rs | 6 ++ src/parser/parse_from_text/mod.rs | 1 + src/parser/parse_from_text/phone_numbers.rs | 101 ++++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 src/parser/parse_from_text/phone_numbers.rs diff --git a/src/parser/mod.rs b/src/parser/mod.rs index dc71d0f..0d76d92 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -21,6 +21,12 @@ pub enum Element<'a> { Link { destination: LinkDestination<'a>, }, + TelephoneNumber{ + /// number exactly how it was found in the input text + number: &'a str, + /// the tel: link (without special chars, but keeps the + in the beginning if it is present) + tel_link: String, + }, EmailAddress(&'a str), // Later: // Mention { diff --git a/src/parser/parse_from_text/mod.rs b/src/parser/parse_from_text/mod.rs index a3180f4..95cd7d9 100644 --- a/src/parser/parse_from_text/mod.rs +++ b/src/parser/parse_from_text/mod.rs @@ -4,6 +4,7 @@ pub(crate) mod base_parsers; mod desktop_subset; pub mod hashtag_content_char_ranges; mod markdown_elements; +mod phone_numbers; mod text_elements; /// parses text elements such as links and email addresses, excluding markdown diff --git a/src/parser/parse_from_text/phone_numbers.rs b/src/parser/parse_from_text/phone_numbers.rs new file mode 100644 index 0000000..412fd9b --- /dev/null +++ b/src/parser/parse_from_text/phone_numbers.rs @@ -0,0 +1,101 @@ +use super::base_parsers::*; +use super::Element; + +use nom::bytes::complete::{tag, take_while}; +use nom::character::complete::satisfy; +use nom::combinator::opt; +use nom::sequence::{delimited, tuple}; +use nom::{bytes::complete::take_while1, combinator::recognize, IResult}; + +/// spaces, dots, or dashes +fn is_sdd(input: char) -> bool { + matches!(input, ' ' | '.' | '-') +} + +fn is_digit(input: char) -> bool { + input.is_digit(10) +} + +fn is_digit_or_ssd(input: char) -> bool { + is_digit(input) || is_sdd(input) +} + +fn internal_telephone_number(input: &str) -> IResult<&str, String, CustomError<&str>> { + // reimplement the android regex rules: from PHONE in android/util/Patterns.java + let (input, (country, area, local)) = tuple(( + opt(tuple(( + opt(tag("+")), + take_while1(is_digit), + take_while(is_sdd), + ))), // +* + opt(tuple(( + delimited(tag("("), take_while1(is_digit), tag(")")), + take_while(is_sdd), + ))), // ()* + delimited( + satisfy(is_digit), + take_while1(is_digit_or_ssd), + // /\ error is that this also eats the last number, we need some other way to express this + // basically eat all is_digit_or_ssd, but if last is a number, don't eat the last number. + satisfy(is_digit), + ), // + + ))(input)?; + + // construct the telephone number uri (currently used by the test in this file) + let country = country + .map(|(plus, digits, _)| format!("{}{digits}", plus.unwrap_or(""))) + .unwrap_or_else(|| "".to_owned()); + let area = area.map(|(digits, _)| digits).unwrap_or(""); + let telephone_number_uri = format!("tel:{}{}{}", country, area, local); + Ok((input, telephone_number_uri)) +} + +pub(crate) fn telephone_number(input: &str) -> IResult<&str, Element, CustomError<&str>> { + let (input, original_number) = recognize(internal_telephone_number)(input)?; + let (_, tel_link) = internal_telephone_number(original_number)?; + Ok(( + input, + Element::TelephoneNumber { + number: original_number, + tel_link, + }, + )) +} + +#[cfg(test)] +mod test { + #![allow(clippy::unwrap_used)] + + use crate::parser::{parse_from_text::phone_numbers::telephone_number, Element}; + + #[test] + fn test_phone_numbers() { + // from https://stackoverflow.com/a/29767609/7655232 + let test_cases = vec![ + ("(123) 456-7890", "1234567890"), + ("(123)456-7890", "1234567890"), + ("123-456-7890", "1234567890"), + ("123.456.7890", "1234567890"), + ("1234567890", "1234567890"), + ("+31636363634", "+31636363634"), + ("075-63546725", "07563546725"), + // from wikipedia https://de.wikipedia.org/w/index.php?title=Rufnummer&oldid=236385081#Nationales + ("089 1234567", "0891234567"), + // https://www.bundesnetzagentur.de/SharedDocs/Downloads/DE/Sachgebiete/Telekommunikation/Unternehmen_Institutionen/Nummerierung/Rufnummern/Mittlg148_2021.pdf?__blob=publicationFile&v=1 + ("(0)152 28817386", "015228817386"), + ("69 90009000", "6990009000"), + ("90009000", "90009000"), + ]; + + for (number, expected_uri) in test_cases { + println!("testing {number}"); + assert_eq!( + telephone_number(number).unwrap().1, + Element::TelephoneNumber { + number, + tel_link: expected_uri.to_owned() + } + ) + } + } +} From fac0153fc138107c1857ba5da37d703df19b4f89 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Thu, 2 Nov 2023 02:47:47 +0100 Subject: [PATCH 2/6] fix it for most numbers --- src/parser/parse_from_text/phone_numbers.rs | 37 ++++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/parser/parse_from_text/phone_numbers.rs b/src/parser/parse_from_text/phone_numbers.rs index 412fd9b..e0372d3 100644 --- a/src/parser/parse_from_text/phone_numbers.rs +++ b/src/parser/parse_from_text/phone_numbers.rs @@ -1,6 +1,8 @@ use super::base_parsers::*; use super::Element; +use nom::AsChar; +use nom::bytes::complete::take; use nom::bytes::complete::{tag, take_while}; use nom::character::complete::satisfy; use nom::combinator::opt; @@ -20,6 +22,23 @@ fn is_digit_or_ssd(input: char) -> bool { is_digit(input) || is_sdd(input) } +fn eat_while_digit_or_sdd_but_spare_last_digit(input: &str) -> IResult<&str, &str, CustomError<&str>> { + let (_, result) = take_while1(is_digit_or_ssd)(input)?; + + for (offset, char) in result.chars().rev().enumerate() { + // find index of last digit + if is_digit(char.as_char()) { + // take everything but the last digit + let consumed_count = result.chars().count().saturating_sub(offset.saturating_add(1)); + let (remainder, digits) = take(consumed_count)(input)?; + return Ok((remainder, digits)) + } + } + + Err(nom::Err::Error(CustomError::UnexpectedContent)) +} + + fn internal_telephone_number(input: &str) -> IResult<&str, String, CustomError<&str>> { // reimplement the android regex rules: from PHONE in android/util/Patterns.java let (input, (country, area, local)) = tuple(( @@ -32,13 +51,11 @@ fn internal_telephone_number(input: &str) -> IResult<&str, String, CustomError<& delimited(tag("("), take_while1(is_digit), tag(")")), take_while(is_sdd), ))), // ()* - delimited( + recognize(delimited( satisfy(is_digit), - take_while1(is_digit_or_ssd), - // /\ error is that this also eats the last number, we need some other way to express this - // basically eat all is_digit_or_ssd, but if last is a number, don't eat the last number. + eat_while_digit_or_sdd_but_spare_last_digit, satisfy(is_digit), - ), // + + )), // + ))(input)?; // construct the telephone number uri (currently used by the test in this file) @@ -46,6 +63,7 @@ fn internal_telephone_number(input: &str) -> IResult<&str, String, CustomError<& .map(|(plus, digits, _)| format!("{}{digits}", plus.unwrap_or(""))) .unwrap_or_else(|| "".to_owned()); let area = area.map(|(digits, _)| digits).unwrap_or(""); + let local = local.replace(is_sdd, ""); let telephone_number_uri = format!("tel:{}{}{}", country, area, local); Ok((input, telephone_number_uri)) } @@ -76,15 +94,16 @@ mod test { ("(123)456-7890", "1234567890"), ("123-456-7890", "1234567890"), ("123.456.7890", "1234567890"), - ("1234567890", "1234567890"), - ("+31636363634", "+31636363634"), + // ("1234567890", "1234567890"), + //("+31636363634", "+31636363634"), + ("+31 636363634", "+31636363634"), ("075-63546725", "07563546725"), // from wikipedia https://de.wikipedia.org/w/index.php?title=Rufnummer&oldid=236385081#Nationales ("089 1234567", "0891234567"), // https://www.bundesnetzagentur.de/SharedDocs/Downloads/DE/Sachgebiete/Telekommunikation/Unternehmen_Institutionen/Nummerierung/Rufnummern/Mittlg148_2021.pdf?__blob=publicationFile&v=1 ("(0)152 28817386", "015228817386"), ("69 90009000", "6990009000"), - ("90009000", "90009000"), + // ("90009000", "90009000"), ]; for (number, expected_uri) in test_cases { @@ -93,7 +112,7 @@ mod test { telephone_number(number).unwrap().1, Element::TelephoneNumber { number, - tel_link: expected_uri.to_owned() + tel_link: format!("tel:{expected_uri}") } ) } From 92bcc56534d8bd0467ea5e29b5d37a4ffd58374f Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Thu, 2 Nov 2023 03:05:22 +0100 Subject: [PATCH 3/6] restrict number length & format document --- src/parser/parse_from_text/phone_numbers.rs | 35 +++++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/parser/parse_from_text/phone_numbers.rs b/src/parser/parse_from_text/phone_numbers.rs index e0372d3..07480b1 100644 --- a/src/parser/parse_from_text/phone_numbers.rs +++ b/src/parser/parse_from_text/phone_numbers.rs @@ -1,13 +1,17 @@ use super::base_parsers::*; use super::Element; -use nom::AsChar; use nom::bytes::complete::take; -use nom::bytes::complete::{tag, take_while}; +use nom::bytes::complete::{tag, take_while, take_while_m_n}; use nom::character::complete::satisfy; use nom::combinator::opt; use nom::sequence::{delimited, tuple}; -use nom::{bytes::complete::take_while1, combinator::recognize, IResult}; +use nom::AsChar; +use nom::{combinator::recognize, IResult}; + +const MAX_COUNTRY_LEN: usize = 3; +const MAX_AREA_LEN: usize = 10; // TODO find real number? +const MAX_LOCAL_LEN: usize = 15; // TODO find real number? /// spaces, dots, or dashes fn is_sdd(input: char) -> bool { @@ -22,33 +26,41 @@ fn is_digit_or_ssd(input: char) -> bool { is_digit(input) || is_sdd(input) } -fn eat_while_digit_or_sdd_but_spare_last_digit(input: &str) -> IResult<&str, &str, CustomError<&str>> { - let (_, result) = take_while1(is_digit_or_ssd)(input)?; +fn eat_while_digit_or_sdd_but_spare_last_digit( + input: &str, +) -> IResult<&str, &str, CustomError<&str>> { + let (_, result) = take_while_m_n(1, MAX_LOCAL_LEN, is_digit_or_ssd)(input)?; for (offset, char) in result.chars().rev().enumerate() { // find index of last digit if is_digit(char.as_char()) { // take everything but the last digit - let consumed_count = result.chars().count().saturating_sub(offset.saturating_add(1)); + let consumed_count = result + .chars() + .count() + .saturating_sub(offset.saturating_add(1)); let (remainder, digits) = take(consumed_count)(input)?; - return Ok((remainder, digits)) + return Ok((remainder, digits)); } } Err(nom::Err::Error(CustomError::UnexpectedContent)) } - fn internal_telephone_number(input: &str) -> IResult<&str, String, CustomError<&str>> { // reimplement the android regex rules: from PHONE in android/util/Patterns.java let (input, (country, area, local)) = tuple(( opt(tuple(( opt(tag("+")), - take_while1(is_digit), + take_while_m_n(1, MAX_COUNTRY_LEN, is_digit), take_while(is_sdd), ))), // +* opt(tuple(( - delimited(tag("("), take_while1(is_digit), tag(")")), + delimited( + tag("("), + take_while_m_n(1, MAX_AREA_LEN, is_digit), + tag(")"), + ), take_while(is_sdd), ))), // ()* recognize(delimited( @@ -104,6 +116,9 @@ mod test { ("(0)152 28817386", "015228817386"), ("69 90009000", "6990009000"), // ("90009000", "90009000"), + // https://en.wikipedia.org/w/index.php?title=E.123&oldid=1181303803 + ("(0607) 123 4567", "06071234567"), + ("+22 607 123 4567", "+226071234567"), ]; for (number, expected_uri) in test_cases { From 5364d2d89074682b3e126d46756bfd101b82d205 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Thu, 2 Nov 2023 03:23:45 +0100 Subject: [PATCH 4/6] fail on too short numbers --- src/parser/parse_from_text/base_parsers.rs | 1 + src/parser/parse_from_text/phone_numbers.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/parser/parse_from_text/base_parsers.rs b/src/parser/parse_from_text/base_parsers.rs index 9881d36..de249e6 100644 --- a/src/parser/parse_from_text/base_parsers.rs +++ b/src/parser/parse_from_text/base_parsers.rs @@ -19,6 +19,7 @@ pub enum CustomError { UnexpectedContent, PrecedingWhitespaceMissing, OptionIsUnexpectedNone, + PhoneNumberNotEnoughDigits, UnxepectedError(String), } diff --git a/src/parser/parse_from_text/phone_numbers.rs b/src/parser/parse_from_text/phone_numbers.rs index 07480b1..ed72acf 100644 --- a/src/parser/parse_from_text/phone_numbers.rs +++ b/src/parser/parse_from_text/phone_numbers.rs @@ -12,6 +12,7 @@ use nom::{combinator::recognize, IResult}; const MAX_COUNTRY_LEN: usize = 3; const MAX_AREA_LEN: usize = 10; // TODO find real number? const MAX_LOCAL_LEN: usize = 15; // TODO find real number? +const PHONE_NUMBER_MINIMUM_DIGITS: usize = 5; /// spaces, dots, or dashes fn is_sdd(input: char) -> bool { @@ -31,6 +32,10 @@ fn eat_while_digit_or_sdd_but_spare_last_digit( ) -> IResult<&str, &str, CustomError<&str>> { let (_, result) = take_while_m_n(1, MAX_LOCAL_LEN, is_digit_or_ssd)(input)?; + if result.chars().filter(|c| is_digit(*c)).count() < PHONE_NUMBER_MINIMUM_DIGITS { + return Err(nom::Err::Error(CustomError::PhoneNumberNotEnoughDigits)); + } + for (offset, char) in result.chars().rev().enumerate() { // find index of last digit if is_digit(char.as_char()) { @@ -132,4 +137,11 @@ mod test { ) } } + + #[test] + fn test_not_enough_digits(){ + telephone_number("(0)152 28").expect_err("fails because number is to short"); + telephone_number("152 28").expect_err("fails because too short"); + telephone_number("(152) 28").expect_err("fails because too short"); + } } From 038480a50695432d30ed2a706196086c13c22ca4 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Thu, 2 Nov 2023 03:47:15 +0100 Subject: [PATCH 5/6] add telephone numers to text parsing also add element type to wasm bindings and demo --- message_parser_wasm/example.js | 6 ++++++ message_parser_wasm/src/lib.rs | 3 ++- message_parser_wasm/src/manual_typings.ts | 3 ++- src/parser/mod.rs | 2 +- src/parser/parse_from_text/phone_numbers.rs | 2 +- src/parser/parse_from_text/text_elements.rs | 3 +++ 6 files changed, 15 insertions(+), 4 deletions(-) diff --git a/message_parser_wasm/example.js b/message_parser_wasm/example.js index e248493..a17aeb3 100644 --- a/message_parser_wasm/example.js +++ b/message_parser_wasm/example.js @@ -96,6 +96,12 @@ function renderElement(elm) { ); return bcs; + case "TelephoneNumber": + let tn = document.createElement("a"); + tn.innerText = elm.c.number; + tn.href = elm.c.tel_link; + return tn; + case "Linebreak": return document.createElement("br"); diff --git a/message_parser_wasm/src/lib.rs b/message_parser_wasm/src/lib.rs index cb6d75f..5bf2696 100644 --- a/message_parser_wasm/src/lib.rs +++ b/message_parser_wasm/src/lib.rs @@ -57,5 +57,6 @@ export type ParsedElement = | { t: "LabeledLink"; c: { label: ParsedElement[]; destination: LinkDestination }; - }; + } + | {t: "TelephoneNumber", c: {number: string, tel_link: string}}; "#; diff --git a/message_parser_wasm/src/manual_typings.ts b/message_parser_wasm/src/manual_typings.ts index 2a7b7a0..3ff2e60 100644 --- a/message_parser_wasm/src/manual_typings.ts +++ b/message_parser_wasm/src/manual_typings.ts @@ -25,4 +25,5 @@ export type ParsedElement = | { t: "LabeledLink"; c: { label: ParsedElement[]; destination: LinkDestination }; - }; + } + | {t: "TelephoneNumber", c: {number: string, tel_link: string}}; diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 0d76d92..ed1243b 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -21,7 +21,7 @@ pub enum Element<'a> { Link { destination: LinkDestination<'a>, }, - TelephoneNumber{ + TelephoneNumber { /// number exactly how it was found in the input text number: &'a str, /// the tel: link (without special chars, but keeps the + in the beginning if it is present) diff --git a/src/parser/parse_from_text/phone_numbers.rs b/src/parser/parse_from_text/phone_numbers.rs index ed72acf..39ad1be 100644 --- a/src/parser/parse_from_text/phone_numbers.rs +++ b/src/parser/parse_from_text/phone_numbers.rs @@ -139,7 +139,7 @@ mod test { } #[test] - fn test_not_enough_digits(){ + fn test_not_enough_digits() { telephone_number("(0)152 28").expect_err("fails because number is to short"); telephone_number("152 28").expect_err("fails because too short"); telephone_number("(152) 28").expect_err("fails because too short"); diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 161d8c6..a88dbfa 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -3,6 +3,7 @@ use crate::parser::link_url::LinkDestination; use super::base_parsers::*; use super::hashtag_content_char_ranges::hashtag_content_char; +use super::phone_numbers::telephone_number; use super::Element; use crate::nom::{Offset, Slice}; use nom::bytes::complete::take_while; @@ -275,6 +276,8 @@ pub(crate) fn parse_text_element( } } { Ok((i, elm)) + } else if let Ok((i, elm)) = telephone_number(input) { + Ok((i, elm)) } else if let Ok((i, _)) = linebreak(input) { Ok((i, Element::Linebreak)) } else { From 7d79ffd60275a67f68be857b25c695e2b6d822ea Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Thu, 2 Nov 2023 03:47:59 +0100 Subject: [PATCH 6/6] fix clippy --- src/parser/parse_from_text/phone_numbers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/parse_from_text/phone_numbers.rs b/src/parser/parse_from_text/phone_numbers.rs index 39ad1be..ef0db1a 100644 --- a/src/parser/parse_from_text/phone_numbers.rs +++ b/src/parser/parse_from_text/phone_numbers.rs @@ -20,7 +20,7 @@ fn is_sdd(input: char) -> bool { } fn is_digit(input: char) -> bool { - input.is_digit(10) + input.is_ascii_digit() } fn is_digit_or_ssd(input: char) -> bool {