Skip to content
This repository has been archived by the owner on Sep 21, 2023. It is now read-only.

Implement formatting check for long hyphen #261

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions test/validation/test_formatting_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
load_valid_transcription_from_file,
)
from tor.validation.formatting_validation import (
check_for_autogenerated_long_hyphen,
check_for_fenced_code_block,
check_for_missing_separators,
check_for_formatting_issues,
Expand Down Expand Up @@ -242,6 +243,23 @@ def test_check_for_fenced_code_block(test_input: str, should_match: bool) -> Non
assert actual == expected


@pytest.mark.parametrize(
"test_input,should_match",
[
("—-", True),
("-—", True),
("---------", False),
("---", False),
("Word\n int x = 1\nWord", False),
],
)
def test_check_for_autogenerated_long_hyphen(test_input: str, should_match: bool) -> None:
"""Test if autogenerated long hyphens are detected."""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The long hyphen isn't always auto-generated, so perhaps "accidental" would be a better descriptor than "autogenerated"?

Side note: if settings for "smart quotes" (and the like) are enabled on macOS, it automatically replaces -- with — there too. Not just mobile. 😄

actual = check_for_autogenerated_long_hyphen(test_input)
expected = FormattingIssue.AUTOGENERATED_LONG_HYPHEN if should_match else None
assert actual == expected


@pytest.mark.parametrize(
"test_input,should_match",
[
Expand Down Expand Up @@ -339,6 +357,12 @@ def test_check_for_invalid_header(test_input: str, should_match: bool) -> None:
),
[FormattingIssue.FENCED_CODE_BLOCK, FormattingIssue.MISSING_SEPARATORS],
),
(
load_invalid_transcription_from_file(
"autogenerated-long-hyphen.txt"
),
[FormattingIssue.AUTOGENERATED_LONG_HYPHEN, FormattingIssue.MISSING_SEPARATORS],
),
(
load_invalid_transcription_from_file("unescaped_username_subreddit.txt"),
[FormattingIssue.UNESCAPED_USERNAME, FormattingIssue.UNESCAPED_SUBREDDIT],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
*Image Transcription: Facebook*

-—

I had to come here to vent because I'm scared and sad .... my 300sqft camper that I birthed my son in unassisted has been "under construction" for two months and I'm due in November and wanted to start an outdoor space before baby came so I could move more freely and privately ... my baby daddy has destroyed all the building supplies $3k+ worth of lumber and insulation left in a big storm instead of being brought inside ... he's been getting drunk and sleeping there instead of working and I've been working full time and doing all the meal prep and laundry and shopping and managing the kids schedules while homeschooling and he doesn't do anything but game and smoke and drink and I have no help or support - not to mention there's an assault charge against him for abusing me now which means CPS is involved and that's stressful itself .... we're homeless and having authorities questioned our whole lives and I have no family or friends and all my energy and money is SPENT doing literally EVERYTHING alone and paying for babysitting... baby daddy won't shower or eat at all hasn't been sleeping just gaming and his car has mold growth and mushrooms in the floor boards.... his mouth is badly infected from lack of hygiene... he canceled all my therapy appointments for the next two months .... and he's due to potentially go to jail in September he refused council for court ... I cry every day.... I'm so alone and sad and overwhelmed and I just want to be in my tiny home but I cannot build it myself ... anyway still planning an unassisted but will probably be totally and utterly alone with me and my two small kids which is .... sad to say the least and probably end up having it on a beach somewhere because I'm legitimately homeless

—-

^^I'm a human volunteer content transcriber for Reddit and you could be too! [If you'd like more information on what we do and why we do it, click here!](https://www.reddit.com/r/TranscribersOfReddit/wiki/index)
1 change: 1 addition & 0 deletions tor/validation/formatting_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class FormattingIssue(Enum):
HEADING_WITH_DASHES = "heading_with_dashes"
MALFORMED_FOOTER = "malformed_footer"
FENCED_CODE_BLOCK = "fenced_code_block"
AUTOGENERATED_LONG_HYPHEN = "autogenerated_long_hyphen"
UNESCAPED_USERNAME = "unescaped_username"
UNESCAPED_SUBREDDIT = "unescaped_subreddit"
UNESCAPED_HEADING = "unescaped_heading"
Expand Down
23 changes: 23 additions & 0 deletions tor/validation/formatting_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@
# Image Transcription
VALID_HEADERS = ["Audio Transcription", "Image Transcription", "Video Transcription"]

# Regex to recognize separators being replaced with autogenerated long hyphens by mobile devices
# —- or -— instead of ---
AUTOGENERATED_LONG_HYPHEN_PATTERN = re.compile(r"—-|-—")


def check_for_bold_header(transcription: str) -> Optional[FormattingIssue]:
"""Check if the transcription has a bold instead of italic header."""
Expand Down Expand Up @@ -149,6 +153,24 @@ def check_for_fenced_code_block(transcription: str) -> Optional[FormattingIssue]
)


def check_for_autogenerated_long_hyphen(transcription: str) -> Optional[FormattingIssue]:
"""Check if the transcription contains autogenerated long hyphens as a result of mobile device 'assistance'

Separator should look like this:
---

Mobile devices may convert the separator to:
—- or -—

These don't display correctly on all devices
"""
return (
FormattingIssue.AUTOGENERATED_LONG_HYPHEN
if AUTOGENERATED_LONG_HYPHEN_PATTERN.search(transcription) is not None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not drop the regex and do a simple x in y check?

Suggested change
if AUTOGENERATED_LONG_HYPHEN_PATTERN.search(transcription) is not None
if "\u2014" in transcription # \u2014 == em dash == "long hyphen"

else None
)


def check_for_unescaped_username(transcription: str) -> Optional[FormattingIssue]:
"""Check if the transcription contains an unescaped username.

Expand Down Expand Up @@ -224,6 +246,7 @@ def check_for_formatting_issues(transcription: str) -> Set[FormattingIssue]:
check_for_heading_with_dashes(transcription),
check_for_missing_separators(transcription),
check_for_fenced_code_block(transcription),
check_for_autogenerated_long_hyphen(transcription),
check_for_unescaped_username(transcription),
check_for_unescaped_subreddit(transcription),
check_for_unescaped_heading(transcription),
Expand Down