Skip to content

Commit

Permalink
0.7.29 - Limit width after word-break.
Browse files Browse the repository at this point in the history
  • Loading branch information
finnbear committed Sep 24, 2024
1 parent 52edb2d commit 6c4b4f9
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
- name: Download Testing Data
run: curl https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv --output test.csv
- name: Test
run: cargo test --release --features width
run: cargo test --release --features width,context
- name: Add wasm32 target
run: rustup target add wasm32-unknown-unknown
- name: Install Trunk
Expand Down
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "rustrict"
authors = ["Finn Bear"]
version = "0.7.28"
version = "0.7.29"
edition = "2021"
license = "MIT OR Apache-2.0"
repository = "https://github.com/finnbear/rustrict/"
Expand Down Expand Up @@ -37,7 +37,7 @@ default = ["censor", "context"]
censor = ["arrayvec", "bitflags", "lazy_static", "itertools", "unicode-normalization", "rustc-hash"]
context = ["censor", "strsim"]
customize = ["censor"]
width = ["lazy_static"]
width = ["lazy_static", "itertools"]
pii = ["lazy_static", "regex"]
find_false_positives = ["censor", "regex", "indicatif", "rayon"]
find_replacements = ["csv"]
Expand Down Expand Up @@ -79,4 +79,4 @@ censor_crate = {package = "censor", version = "0.3.0"}
rustrict_old = {package = "rustrict", version = "0.7.24"}
serial_test = "0.5"
bincode = "1.3.3"
serde_json = "1"
serde_json = "1"
1 change: 1 addition & 0 deletions fuzz/fuzz_targets/fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ fuzz_target!(|data: &[u8]| {

if let Ok(text) = std::str::from_utf8(input) {
let _ = rustrict::width_str(text);
let _ = rustrict::width_str_max_unbroken(text);
let _ = rustrict::trim_to_width(text, 10);
let _ = rustrict::censor_and_analyze_pii(text);

Expand Down
88 changes: 75 additions & 13 deletions src/context.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::width::WordBreak;
use crate::{trim_whitespace, Censor, Type};

use crate::censor::should_skip_censor;
Expand Down Expand Up @@ -49,6 +50,7 @@ impl Debug for Context {
/// as new fields may be added in the future.
#[derive(Clone, Debug)]
#[cfg_attr(doc, doc(cfg(feature = "context")))]
#[non_exhaustive]
pub struct ContextProcessingOptions {
/// Block messages if the user has been manually muted.
pub block_if_muted: bool,
Expand All @@ -63,6 +65,9 @@ pub struct ContextProcessingOptions {
///
/// Messages will be trimmed to fit.
pub character_limit: Option<NonZeroUsize>,
/// Ensure word-break will work on the message.
#[cfg(feature = "width")]
pub word_break: Option<ContextWordBreakOptions>,
/// Rate-limiting options.
pub rate_limit: Option<ContextRateLimitOptions>,
/// Block messages if they are very similar to this many previous messages.
Expand All @@ -83,6 +88,8 @@ impl Default for ContextProcessingOptions {
safe_mode_until: None,
character_limit: Some(NonZeroUsize::new(2048).unwrap()),
rate_limit: Some(ContextRateLimitOptions::default()),
#[cfg(feature = "width")]
word_break: Some(ContextWordBreakOptions::default()),
repetition_limit: Some(ContextRepetitionLimitOptions::default()),
max_safe_timeout: Duration::from_secs(30 * 60),
trim_whitespace: true,
Expand Down Expand Up @@ -127,6 +134,27 @@ impl ContextRateLimitOptions {
}
}

/// Configuration for rejecting messages that a renderer could not wrap.
///
/// When present in the processing options, a message is blocked if its widest
/// unbreakable run is wider than `limit`.
#[cfg(feature = "width")]
#[cfg_attr(doc, doc(cfg(all(feature = "context", feature = "width"))))]
#[derive(Clone, Debug)]
pub struct ContextWordBreakOptions {
    /// The word-breaking strategy the text will be displayed with.
    pub word_break: WordBreak,
    /// The greatest permitted width of any single unbreakable run; a message
    /// containing a wider run is blocked in its entirety.
    pub limit: NonZeroUsize,
}

#[cfg(feature = "width")]
impl Default for ContextWordBreakOptions {
    /// Break-all display with an unbroken-run limit of 16.
    fn default() -> Self {
        // Widest unbreakable run tolerated by default.
        const DEFAULT_LIMIT: usize = 16;

        ContextWordBreakOptions {
            limit: NonZeroUsize::new(DEFAULT_LIMIT).unwrap(),
            word_break: WordBreak::BreakAll,
        }
    }
}

/// Options that control repetition-limiting.
#[derive(Clone, Debug)]
#[cfg_attr(doc, doc(cfg(feature = "context")))]
Expand Down Expand Up @@ -252,6 +280,16 @@ impl Context {
censored_str = trim_whitespace(censored_str);
}

#[cfg(feature = "width")]
{
if let Some(word_break) = &options.word_break {
let max = crate::width::width_str_max_unbroken(censored_str, word_break.word_break);
if max > word_break.limit.get() {
return Err(BlockReason::Unbroken(max));
}
}
}

if censored_str.len() < censored.len() {
// Something was trimmed, so we must re-allocate.
censored = String::from(censored_str);
Expand Down Expand Up @@ -514,6 +552,9 @@ impl Default for Context {
pub enum BlockReason {
/// The particular message was *severely* inappropriate, more specifically, `Type`.
Inappropriate(Type),
#[cfg(feature = "width")]
/// There was an unbroken part of the string of this length, exceeding the limit.
Unbroken(usize),
/// Recent messages were generally inappropriate, and this message isn't on the safe list.
/// Alternatively, if targeted is false, safe mode was configured globally.
/// Try again after `Duration`.
Expand All @@ -537,7 +578,18 @@ impl BlockReason {
/// default warning to send to the user.
pub fn generic_str(self) -> &'static str {
match self {
Self::Inappropriate(_) => "Your message was held for severe profanity",
Self::Inappropriate(typ) => {
if typ.is(Type::OFFENSIVE) {
"Your message was held for being highly offensive"
} else if typ.is(Type::SEXUAL) {
"Your message was held for being overly sexual"
} else if typ.is(Type::MEAN) {
"Your message was held for being overly mean"
} else {
"Your message was held for severe profanity"
}
}
Self::Unbroken(_) => "Part of your message is too wide to display",
Self::Unsafe { .. } => "You have been temporarily restricted due to profanity/spam",
Self::Repetitious(_) => "Your message was too similar to recent messages",
Self::Spam(_) => "You have been temporarily muted due to excessive frequency",
Expand All @@ -556,15 +608,6 @@ impl BlockReason {
/// muted for).
pub fn contextual_string(self) -> String {
match self {
Self::Inappropriate(typ) => String::from(if typ.is(Type::OFFENSIVE) {
"Your message was held for being highly offensive"
} else if typ.is(Type::SEXUAL) {
"Your message was held for being overly sexual"
} else if typ.is(Type::MEAN) {
"Your message was held for being overly mean"
} else {
"Your message was held for severe profanity"
}),
Self::Unsafe {
remaining,
targeted: true,
Expand All @@ -584,7 +627,7 @@ impl BlockReason {
FormattedDuration(dur)
),
Self::Muted(dur) => format!("You have been muted for {}", FormattedDuration(dur)),
_ => String::from(self.generic_str()),
_ => self.generic_str().to_owned(),
}
}
}
Expand Down Expand Up @@ -836,11 +879,17 @@ mod tests {
#[test]
#[cfg(feature = "width")]
fn character_limit() {
use crate::{BlockReason, Context, ContextProcessingOptions};
use crate::{
context::ContextWordBreakOptions, BlockReason, Context, ContextProcessingOptions,
};
let mut ctx = Context::new();

let opts = ContextProcessingOptions {
character_limit: Some(NonZeroUsize::new(5).unwrap()),
word_break: Some(ContextWordBreakOptions {
word_break: crate::width::WordBreak::BreakAll,
limit: NonZeroUsize::new(5).unwrap(),
}),
..Default::default()
};

Expand All @@ -849,11 +898,24 @@ mod tests {
Ok(String::from("abcde"))
);

#[cfg(feature = "width")]
assert_eq!(
ctx.process_with_options(String::from("a﷽"), &opts),
Ok(String::from("a"))
);

let opts = ContextProcessingOptions {
character_limit: Some(NonZeroUsize::new(20).unwrap()),
word_break: Some(ContextWordBreakOptions {
word_break: crate::width::WordBreak::BreakAll,
limit: NonZeroUsize::new(5).unwrap(),
}),
..Default::default()
};

assert_eq!(
ctx.process_with_options("abc ௌௌௌௌ def".to_owned(), &opts),
Err(BlockReason::Unbroken(10))
);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ pub use replacements::Replacements;
pub use trie::Trie;

#[cfg(feature = "width")]
pub use width::{trim_to_width, width, width_str};
pub use width::{trim_to_width, width, width_str, width_str_max_unbroken};

#[cfg(feature = "censor")]
pub use typ::Type;
Expand Down
85 changes: 83 additions & 2 deletions src/width.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::is_whitespace;
use std::str::from_utf8;

const MODE_WIDTH: u8 = 10;
Expand Down Expand Up @@ -65,6 +66,60 @@ pub fn width_str(s: &str) -> usize {
s.chars().map(|c| width(c) / 100).sum::<usize>() / 10
}

/// How text is expected to be displayed.
///
/// Eventually, `BreakWord` will be supported.
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
/// wildcard arm. It is cheap to copy and can be compared and hashed, which
/// lets callers test or key on the chosen mode.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum WordBreak {
    // TODO: BreakWord
    /// A line break may be inserted between any two characters, as with CSS
    /// `word-break: break-all;`.
    BreakAll,
}

/// Measures the widest part of `s` that cannot be split by a line break.
///
/// This is the counterpart of `width_str` for layout purposes: some text
/// cannot be broken even by CSS's `word-break: break-all;` (or equivalents),
/// so this reports how wide a single line might become.
///
/// For example, try selecting the following unbroken part: ௌௌௌௌ
///
/// `s` is cut at every offset produced by `break_all_linebreaks`; each piece
/// has its trailing whitespace removed and is measured with `width_str`. The
/// largest measurement is returned (`0` for an empty string). `_word_break`
/// is currently unused.
pub fn width_str_max_unbroken(s: &str, _word_break: WordBreak) -> usize {
    let mut widest = 0;
    let mut piece_start = 0;
    for piece_end in break_all_linebreaks(s) {
        let piece = s[piece_start..piece_end].trim_end_matches(is_whitespace);
        widest = widest.max(width_str(piece));
        // The next piece begins where this one ended.
        piece_start = piece_end;
    }
    widest
}

// TODO unicode-linebreak = { version = "0.1.5", optional = true }

/// Yields the byte offsets of `s` at which a line break may be placed,
/// always finishing with `s.len()`.
///
/// Every pair of adjacent characters is inspected. The offset of the second
/// character is yielded when neither character is a mark (`Mn`/`Mc`), or when
/// either one is a separator (`Zs`/`Zl`).
fn break_all_linebreaks(s: &str) -> impl Iterator<Item = usize> + '_ {
    use finl_unicode::categories::{CharacterCategories, MinorCategory};
    use itertools::Itertools;

    /// Whether the category is one of the mark categories checked here.
    fn is_mark(category: &MinorCategory) -> bool {
        matches!(category, MinorCategory::Mn | MinorCategory::Mc)
    }

    /// Whether the category is one of the separator categories checked here.
    fn is_separator(category: &MinorCategory) -> bool {
        matches!(category, MinorCategory::Zs | MinorCategory::Zl)
    }

    let interior = s
        .char_indices()
        .tuple_windows::<((usize, char), (usize, char))>()
        .filter(|&((_, before), (_, after))| {
            let before = before.get_minor_category();
            let after = after.get_minor_category();
            let touches_mark = is_mark(&before) || is_mark(&after);
            let touches_separator = is_separator(&before) || is_separator(&after);
            // A mark glues the pair together unless a separator is involved.
            !touches_mark || touches_separator
        })
        .map(|(_, (offset, _))| offset);

    // The end of the string always closes the final piece.
    interior.chain(std::iter::once(s.len()))
}

/// Trims a string to a maximum number of `m`'s. A budget of 5 would allow five m, or more narrower
/// characters, or fewer wider characters.
pub fn trim_to_width(s: &str, mut budget: usize) -> &str {
Expand All @@ -81,8 +136,8 @@ pub fn trim_to_width(s: &str, mut budget: usize) -> &str {

#[cfg(test)]
mod test {
use crate::width::{trim_to_width, width_str};
use crate::{width, CensorStr};
use crate::width::{trim_to_width, width_str, WordBreak};
use crate::{width, width_str_max_unbroken, CensorStr};
use serial_test::serial;

/*
Expand All @@ -92,6 +147,23 @@ mod test {
}
*/

/// Checks `width_str_max_unbroken` on short ASCII inputs under `BreakAll`.
///
/// Every non-empty case expects a width of 1: none of these inputs contains
/// a mark character, so a break is allowed between any two characters and no
/// unbroken piece is longer than a single `m`.
#[test]
pub fn unbroken() {
// (input, expected widest unbroken width)
let tests = [
// The empty string has no pieces wider than zero.
("", 0),
("m", 1),
("mm", 1),
("m m", 1),
("m m", 1),
("mm m", 1),
("m mm", 1),
("m;m", 1),
];
for (s, w) in tests {
// The failure message echoes the input and the expected width.
assert_eq!(width_str_max_unbroken(s, WordBreak::BreakAll), w, "{s} {w}");
}
}

#[test]
pub fn m() {
assert_eq!(width('m'), 1000);
Expand Down Expand Up @@ -123,6 +195,15 @@ mod test {
assert!(width('꧅') >= 1500);
}

#[test]
pub fn tamil() {
assert_eq!(
width_str_max_unbroken("abc ௌௌௌௌ def", WordBreak::BreakAll),
10
);
assert_eq!(width_str_max_unbroken("abc ௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌ", WordBreak::BreakAll), 345);
}

#[test]
pub fn emoji() {
assert_eq!(width_str("😀🐿"), 4);
Expand Down

0 comments on commit 6c4b4f9

Please sign in to comment.