@pytest.mark.parametrize(
    "test_input,should_match",
    [
        # Em dash followed by a hyphen: what `---` typically becomes on mobile.
        ("—-", True),
        # Hyphen followed by an em dash.
        ("-—", True),
        # Broken separator embedded between paragraphs, as in a real transcription.
        ("Text\n\n—-\n\nText", True),
        # Plain hyphen runs are valid separators and must not be flagged.
        ("---------", False),
        ("---", False),
        # Text without any dashes at all.
        ("Word\n    int x = 1\nWord", False),
    ],
)
def test_check_for_autogenerated_long_hyphen(test_input: str, should_match: bool) -> None:
    """Test if autogenerated long hyphens are detected."""
    actual = check_for_autogenerated_long_hyphen(test_input)
    expected = FormattingIssue.AUTOGENERATED_LONG_HYPHEN if should_match else None
    assert actual == expected
+-— + +I had to come here to vent because I'm scared and sad .... my 300sqft camper that I birthed my son in unassisted has been "under construction" for two months and I'm due in November and wanted to start an outdoor space before baby came so I could move more freely and privately ... my baby daddy has destroyed all the building supplies $3k+ worth of lumber and insulation left in a big storm instead of being brought inside ... he's been getting drunk and sleeping there instead of working and I've been working full time and doing all the meal prep and laundry and shopping and managing the kids schedules while homeschooling and he doesn't do anything but game and smoke and drink and I have no help or support - not to mention there's an assault charge against him for abusing me now which means CPS is involved and that's stressful itself .... we're homeless and having authorities questioned our whole lives and I have no family or friends and all my energy and money is SPENT doing literally EVERYTHING alone and paying for babysitting... baby daddy won't shower or eat at all hasn't been sleeping just gaming and his car has mold growth and mushrooms in the floor boards.... his mouth is badly infected from lack of hygiene... he canceled all my therapy appointments for the next two months .... and he's due to potentially go to jail in September he refused council for court ... I cry every day.... I'm so alone and sad and overwhelmed and I just want to be in my tiny home but I cannot build it myself ... anyway still planning an unassisted but will probably be totally and utterly alone with me and my two small kids which is .... sad to say the least and probably end up having it on a beach somewhere because I'm legitimately homeless + +—- + +^^I'm a human volunteer content transcriber for Reddit and you could be too! 
# Separators that a mobile keyboard has "auto-corrected": instead of the plain
# `---`, the text contains an em dash next to a hyphen (`—-` or `-—`).
AUTOGENERATED_LONG_HYPHEN_PATTERN = re.compile(r"—-|-—")


def check_for_autogenerated_long_hyphen(transcription: str) -> Optional[FormattingIssue]:
    """Check if the transcription contains an autogenerated long hyphen.

    A separator is expected to be written with plain hyphens:
    ---

    Mobile devices may silently turn it into one of:
    —- or -—

    These variants don't display correctly on all devices.

    Returns FormattingIssue.AUTOGENERATED_LONG_HYPHEN if either variant occurs
    anywhere in the transcription, otherwise None.
    """
    # Guard clause: nothing to report when neither variant is present.
    if AUTOGENERATED_LONG_HYPHEN_PATTERN.search(transcription) is None:
        return None
    return FormattingIssue.AUTOGENERATED_LONG_HYPHEN