From fe7a9472bb880fb42791eda0a118b206bf38f846 Mon Sep 17 00:00:00 2001 From: Emily Pastewka Date: Tue, 7 Nov 2023 20:29:26 -0700 Subject: [PATCH 1/2] filter out URLs before finding hashtags --- obsidiantools/_constants.py | 1 + obsidiantools/md_utils.py | 7 +++++++ tests/vault-stub/Sussudio.md | 1 + 3 files changed, 9 insertions(+) diff --git a/obsidiantools/_constants.py b/obsidiantools/_constants.py index a5dcff2..a821c4d 100644 --- a/obsidiantools/_constants.py +++ b/obsidiantools/_constants.py @@ -1,6 +1,7 @@ # WIKILINKS AND EMBEDDED FILES: regex that includes any aliases # group 0 captures embedded link; group 1 is everything inside [[]] WIKILINK_REGEX = r'(!)?\[{2}([^\]\]]+)\]{2}' +URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))" # TAGS TAG_INCLUDE_NESTED_REGEX = r'(? list[str]: str_transform_func=_transform_md_file_string_for_tag_parsing) # remove wikilinks so that '#' headers are not caught: src_txt = _remove_wikilinks_from_source_text(src_txt) + # remove URLs so that '#' in URLs are not caught + src_text = _remove_URLs_from_source_text(src_text) tags = _get_tags_from_source_text(src_txt, show_nested=show_nested) return tags @@ -444,6 +447,10 @@ def _remove_wikilinks_from_source_text(src_txt: str) -> str: return re.sub(WIKILINK_REGEX, '', src_txt) +def _remove_URLs_from_source_text(src_txt: str) -> str: + return re.sub(URL_REGEX, '', src_txt) + + def _transform_md_file_string_for_tag_parsing(txt: str) -> str: return txt.replace('\\#', '') diff --git a/tests/vault-stub/Sussudio.md b/tests/vault-stub/Sussudio.md index e8b49ce..26fe871 100755 --- a/tests/vault-stub/Sussudio.md +++ b/tests/vault-stub/Sussudio.md @@ -32,6 +32,7 @@ However these shouldn't be recognised as tags: - #1985 - [[American Psycho (film)#Patrick Bateman]] - \#hash_char_not_tag +- docs.google.com/document/d/12345/edit#heading ```python # #code_comment_not_tag From c0415ca12c54cae3ab61aae1f1326bdebf7b2678 Mon Sep 17 00:00:00 2001 From: Emily Pastewka Date: Tue, 7 Nov 2023 20:40:49 -0700 Subject: [PATCH 2/2] test _remove_wikilinks_from_source_text too --- tests/.DS_Store | Bin 0 -> 6148 bytes tests/test_md_utils.py | 11 ++++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 tests/.DS_Store diff --git a/tests/.DS_Store b/tests/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..bb65db76aaae8dbf5788f25e7a353d3d8595ab27 GIT binary patch literal 6148 zcmeHK!A`?440Xn&b#U2{V-8$7Oz;6^DqpZ4U}Z24EuDaJC1<{ZAK}O^!0+%Jdt+5Q zu@eGhOU_FjJ8@s5Iw2xAspn&&5fLR&#@-N3hwwOSOL}VIK^Nz^qZt*&Vm!U9THa>( zjSTR&8_WIY|M~X&e>2FQi~(cd zUoqhN*)*Hrm9)0DUXE+6hh9Kg*e?jKL(qw(7`|MJ51~