0.7.29 - Limit width after word-break.

finnbear · Sep 24, 2024 · 6c4b4f9 · 6c4b4f9
1 parent 52edb2d
commit 6c4b4f9
Show file tree

Hide file tree

Showing 6 changed files with 164 additions and 20 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -40,7 +40,7 @@ jobs:
       - name: Download Testing Data
         run: curl https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv --output test.csv
       - name: Test
-        run: cargo test --release --features width
+        run: cargo test --release --features width,context
       - name: Add wasm32 target
         run: rustup target add wasm32-unknown-unknown
       - name: Install Trunk

diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "rustrict"
 authors = ["Finn Bear"]
-version = "0.7.28"
+version = "0.7.29"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/finnbear/rustrict/"
@@ -37,7 +37,7 @@ default = ["censor", "context"]
 censor = ["arrayvec", "bitflags", "lazy_static", "itertools", "unicode-normalization", "rustc-hash"]
 context = ["censor", "strsim"]
 customize = ["censor"]
-width = ["lazy_static"]
+width = ["lazy_static", "itertools"]
 pii = ["lazy_static", "regex"]
 find_false_positives = ["censor", "regex", "indicatif", "rayon"]
 find_replacements = ["csv"]
@@ -79,4 +79,4 @@ censor_crate = {package = "censor", version = "0.3.0"}
 rustrict_old = {package = "rustrict", version = "0.7.24"}
 serial_test = "0.5"
 bincode = "1.3.3"
-serde_json = "1"
+serde_json = "1"
diff --git a/fuzz/fuzz_targets/fuzz.rs b/fuzz/fuzz_targets/fuzz.rs
@@ -9,6 +9,7 @@ fuzz_target!(|data: &[u8]| {
 
         if let Ok(text) = std::str::from_utf8(input) {
             let _ = rustrict::width_str(text);
+            let _ = rustrict::width_str_max_unbroken(text);
             let _ = rustrict::trim_to_width(text, 10);
             let _ = rustrict::censor_and_analyze_pii(text);
 

diff --git a/src/context.rs b/src/context.rs
@@ -1,3 +1,4 @@
+use crate::width::WordBreak;
 use crate::{trim_whitespace, Censor, Type};
 
 use crate::censor::should_skip_censor;
@@ -49,6 +50,7 @@ impl Debug for Context {
 /// as new fields may be added in the future.
 #[derive(Clone, Debug)]
 #[cfg_attr(doc, doc(cfg(feature = "context")))]
+#[non_exhaustive]
 pub struct ContextProcessingOptions {
     /// Block messages if the user has been manually muted.
     pub block_if_muted: bool,
@@ -63,6 +65,9 @@ pub struct ContextProcessingOptions {
     ///
     /// Messages will be trimmed to fit.
     pub character_limit: Option<NonZeroUsize>,
+    /// Ensure word-break will work on the message.
+    #[cfg(feature = "width")]
+    pub word_break: Option<ContextWordBreakOptions>,
     /// Rate-limiting options.
     pub rate_limit: Option<ContextRateLimitOptions>,
     /// Block messages if they are very similar to this many previous messages.
@@ -83,6 +88,8 @@ impl Default for ContextProcessingOptions {
             safe_mode_until: None,
             character_limit: Some(NonZeroUsize::new(2048).unwrap()),
             rate_limit: Some(ContextRateLimitOptions::default()),
+            #[cfg(feature = "width")]
+            word_break: Some(ContextWordBreakOptions::default()),
             repetition_limit: Some(ContextRepetitionLimitOptions::default()),
             max_safe_timeout: Duration::from_secs(30 * 60),
             trim_whitespace: true,
@@ -127,6 +134,27 @@ impl ContextRateLimitOptions {
     }
 }
 
+/// Options that ensure word break will be possible.
+///
+/// When set as `ContextProcessingOptions::word_break`, a message is blocked with
+/// `BlockReason::Unbroken` if the width of its widest unbreakable part exceeds `limit`.
+#[derive(Clone, Debug)]
+#[cfg(feature = "width")]
+#[cfg_attr(doc, doc(cfg(all(feature = "context", feature = "width"))))]
+pub struct ContextWordBreakOptions {
+    /// The type of word-breaking used to display the text.
+    pub word_break: WordBreak,
+    /// The maximum width of an unbreakable part, as computed by `width_str_max_unbroken`
+    /// (before the entire message is blocked).
+    pub limit: NonZeroUsize,
+}
+
+#[cfg(feature = "width")]
+impl Default for ContextWordBreakOptions {
+    /// Uses `WordBreak::BreakAll` and a limit of 16.
+    fn default() -> Self {
+        let limit = NonZeroUsize::new(16).unwrap();
+        ContextWordBreakOptions {
+            word_break: WordBreak::BreakAll,
+            limit,
+        }
+    }
+}
+
 /// Options that control repetition-limiting.
 #[derive(Clone, Debug)]
 #[cfg_attr(doc, doc(cfg(feature = "context")))]
@@ -252,6 +280,16 @@ impl Context {
             censored_str = trim_whitespace(censored_str);
         }
 
+        #[cfg(feature = "width")]
+        {
+            if let Some(word_break) = &options.word_break {
+                let max = crate::width::width_str_max_unbroken(censored_str, word_break.word_break);
+                if max > word_break.limit.get() {
+                    return Err(BlockReason::Unbroken(max));
+                }
+            }
+        }
+
         if censored_str.len() < censored.len() {
             // Something was trimmed, so we must re-allocate.
             censored = String::from(censored_str);
@@ -514,6 +552,9 @@ impl Default for Context {
 pub enum BlockReason {
     /// The particular message was *severely* inappropriate, more specifically, `Type`.
     Inappropriate(Type),
+    #[cfg(feature = "width")]
+    /// There was an unbroken part of the string of this length, exceeding the limit.
+    Unbroken(usize),
     /// Recent messages were generally inappropriate, and this message isn't on the safe list.
     /// Alternatively, if targeted is false, safe mode was configured globally.
     /// Try again after `Duration`.
@@ -537,7 +578,18 @@ impl BlockReason {
     /// default warning to send to the user.
     pub fn generic_str(self) -> &'static str {
         match self {
-            Self::Inappropriate(_) => "Your message was held for severe profanity",
+            Self::Inappropriate(typ) => {
+                if typ.is(Type::OFFENSIVE) {
+                    "Your message was held for being highly offensive"
+                } else if typ.is(Type::SEXUAL) {
+                    "Your message was held for being overly sexual"
+                } else if typ.is(Type::MEAN) {
+                    "Your message was held for being overly mean"
+                } else {
+                    "Your message was held for severe profanity"
+                }
+            }
+            Self::Unbroken(_) => "Part of your message is too wide to display",
             Self::Unsafe { .. } => "You have been temporarily restricted due to profanity/spam",
             Self::Repetitious(_) => "Your message was too similar to recent messages",
             Self::Spam(_) => "You have been temporarily muted due to excessive frequency",
@@ -556,15 +608,6 @@ impl BlockReason {
     /// muted for).
     pub fn contextual_string(self) -> String {
         match self {
-            Self::Inappropriate(typ) => String::from(if typ.is(Type::OFFENSIVE) {
-                "Your message was held for being highly offensive"
-            } else if typ.is(Type::SEXUAL) {
-                "Your message was held for being overly sexual"
-            } else if typ.is(Type::MEAN) {
-                "Your message was held for being overly mean"
-            } else {
-                "Your message was held for severe profanity"
-            }),
             Self::Unsafe {
                 remaining,
                 targeted: true,
@@ -584,7 +627,7 @@ impl BlockReason {
                 FormattedDuration(dur)
             ),
             Self::Muted(dur) => format!("You have been muted for {}", FormattedDuration(dur)),
-            _ => String::from(self.generic_str()),
+            _ => self.generic_str().to_owned(),
         }
     }
 }
@@ -836,11 +879,17 @@ mod tests {
     #[test]
     #[cfg(feature = "width")]
     fn character_limit() {
-        use crate::{BlockReason, Context, ContextProcessingOptions};
+        use crate::{
+            context::ContextWordBreakOptions, BlockReason, Context, ContextProcessingOptions,
+        };
         let mut ctx = Context::new();
 
         let opts = ContextProcessingOptions {
             character_limit: Some(NonZeroUsize::new(5).unwrap()),
+            word_break: Some(ContextWordBreakOptions {
+                word_break: crate::width::WordBreak::BreakAll,
+                limit: NonZeroUsize::new(5).unwrap(),
+            }),
             ..Default::default()
         };
 
@@ -849,11 +898,24 @@ mod tests {
             Ok(String::from("abcde"))
         );
 
-        #[cfg(feature = "width")]
         assert_eq!(
             ctx.process_with_options(String::from("a﷽"), &opts),
             Ok(String::from("a"))
         );
+
+        let opts = ContextProcessingOptions {
+            character_limit: Some(NonZeroUsize::new(20).unwrap()),
+            word_break: Some(ContextWordBreakOptions {
+                word_break: crate::width::WordBreak::BreakAll,
+                limit: NonZeroUsize::new(5).unwrap(),
+            }),
+            ..Default::default()
+        };
+
+        assert_eq!(
+            ctx.process_with_options("abc ௌௌௌௌ def".to_owned(), &opts),
+            Err(BlockReason::Unbroken(10))
+        );
     }
 
     #[test]

diff --git a/src/lib.rs b/src/lib.rs
@@ -34,7 +34,7 @@ pub use replacements::Replacements;
 pub use trie::Trie;
 
 #[cfg(feature = "width")]
-pub use width::{trim_to_width, width, width_str};
+pub use width::{trim_to_width, width, width_str, width_str_max_unbroken};
 
 #[cfg(feature = "censor")]
 pub use typ::Type;

diff --git a/src/width.rs b/src/width.rs
@@ -1,3 +1,4 @@
+use crate::is_whitespace;
 use std::str::from_utf8;
 
 const MODE_WIDTH: u8 = 10;
@@ -65,6 +66,60 @@ pub fn width_str(s: &str) -> usize {
     s.chars().map(|c| width(c) / 100).sum::<usize>() / 10
 }
 
+/// How text is expected to be displayed.
+///
+/// Eventually, `BreakWord` will be supported.
+#[derive(Copy, Clone, Debug)]
+#[non_exhaustive]
+pub enum WordBreak {
+    // TODO: BreakWord
+    /// Presumably corresponds to CSS's `word-break: break-all;` (or equivalents): a line may
+    /// break between most adjacent characters, but some sequences still cannot be broken.
+    BreakAll,
+}
+
+/// Like `width_str` but computes the width of the max unbroken (no line break) part of the string.
+///
+/// In certain cases, not even CSS's `word-break: break-all;` (or equivalents) will be able to
+/// break a string, so it's good to know how long the lines might get.
+///
+/// For example, try selecting the following unbroken part: ௌௌௌௌ
+///
+/// Whitespace at the end of each part does not count toward that part's width, and an empty
+/// string has a max unbroken width of 0.
+///
+/// The `_word_break` argument is currently unused.
+pub fn width_str_max_unbroken(s: &str, _word_break: WordBreak) -> usize {
+    // Byte offset where the part currently being measured begins; it advances to each
+    // break opportunity in turn.
+    let mut start = 0;
+    break_all_linebreaks(s)
+        .map(|p| {
+            let unbroken = &s[start..p];
+            start = p;
+            width_str(unbroken.trim_end_matches(is_whitespace))
+        })
+        .max()
+        .unwrap_or(0)
+}
+
+// TODO unicode-linebreak = { version = "0.1.5", optional = true }
+
+/// Yields the byte offsets in `s` at which a line break is considered possible, always
+/// finishing with `s.len()`.
+///
+/// A break is allowed between two adjacent characters unless either one is a mark
+/// (minor category `Mn` or `Mc`); a neighbouring separator (`Zs` or `Zl`) always allows it.
+fn break_all_linebreaks(s: &str) -> impl Iterator<Item = usize> + '_ {
+    use finl_unicode::categories::{CharacterCategories, MinorCategory};
+    use itertools::Itertools;
+
+    let is_mark = |cat: &MinorCategory| matches!(cat, MinorCategory::Mn | MinorCategory::Mc);
+    let is_separator = |cat: &MinorCategory| matches!(cat, MinorCategory::Zs | MinorCategory::Zl);
+
+    let interior = s
+        .char_indices()
+        .tuple_windows()
+        .filter_map(move |((_, left), (offset, right))| {
+            let left = left.get_minor_category();
+            let right = right.get_minor_category();
+            let beside_mark = is_mark(&left) || is_mark(&right);
+            let beside_separator = is_separator(&left) || is_separator(&right);
+            if beside_separator || !beside_mark {
+                Some(offset)
+            } else {
+                None
+            }
+        });
+
+    // The end of the string always terminates the last part.
+    interior.chain(std::iter::once(s.len()))
+}
+
 /// Trims a string to a maximum number of `m`'s. A budget of 5 would allow five m, or more narrower
 /// characters, or fewer wider characters.
 pub fn trim_to_width(s: &str, mut budget: usize) -> &str {
@@ -81,8 +136,8 @@ pub fn trim_to_width(s: &str, mut budget: usize) -> &str {
 
 #[cfg(test)]
 mod test {
-    use crate::width::{trim_to_width, width_str};
-    use crate::{width, CensorStr};
+    use crate::width::{trim_to_width, width_str, WordBreak};
+    use crate::{width, width_str_max_unbroken, CensorStr};
     use serial_test::serial;
 
     /*
@@ -92,6 +147,23 @@ mod test {
     }
      */
 
+    #[test]
+    pub fn unbroken() {
+        // Each case pairs an input with the expected width of its widest unbreakable part.
+        let cases = [
+            ("", 0),
+            ("m", 1),
+            ("mm", 1),
+            ("m m", 1),
+            ("m     m", 1),
+            ("mm m", 1),
+            ("m mm", 1),
+            ("m;m", 1),
+        ];
+        for (input, expected) in cases {
+            let actual = width_str_max_unbroken(input, WordBreak::BreakAll);
+            assert_eq!(actual, expected, "{input} {expected}");
+        }
+    }
+
     #[test]
     pub fn m() {
         assert_eq!(width('m'), 1000);
@@ -123,6 +195,15 @@ mod test {
         assert!(width('꧅') >= 1500);
     }
 
+    #[test]
+    pub fn tamil() {
+        assert_eq!(
+            width_str_max_unbroken("abc ௌௌௌௌ def", WordBreak::BreakAll),
+            10
+        );
+        assert_eq!(width_str_max_unbroken("abc ௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌ", WordBreak::BreakAll), 345);
+    }
+
     #[test]
     pub fn emoji() {
         assert_eq!(width_str("😀🐿"), 4);