From fe7a9472bb880fb42791eda0a118b206bf38f846 Mon Sep 17 00:00:00 2001 From: Emily Pastewka Date: Tue, 7 Nov 2023 20:29:26 -0700 Subject: [PATCH 1/4] filter out URLs before finding hashtags --- obsidiantools/_constants.py | 1 + obsidiantools/md_utils.py | 7 +++++++ tests/vault-stub/Sussudio.md | 1 + 3 files changed, 9 insertions(+) diff --git a/obsidiantools/_constants.py b/obsidiantools/_constants.py index a5dcff2..a821c4d 100644 --- a/obsidiantools/_constants.py +++ b/obsidiantools/_constants.py @@ -1,6 +1,7 @@ # WIKILINKS AND EMBEDDED FILES: regex that includes any aliases # group 0 captures embedded link; group 1 is everything inside [[]] WIKILINK_REGEX = r'(!)?\[{2}([^\]\]]+)\]{2}' +URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))" # TAGS TAG_INCLUDE_NESTED_REGEX = r'(? list[str]: str_transform_func=_transform_md_file_string_for_tag_parsing) # remove wikilinks so that '#' headers are not caught: src_txt = _remove_wikilinks_from_source_text(src_txt) + # remove URLs so that '#' in URLs are not caught + src_text = _remove_URLs_from_source_text(src_text) tags = _get_tags_from_source_text(src_txt, show_nested=show_nested) return tags @@ -444,6 +447,10 @@ def _remove_wikilinks_from_source_text(src_txt: str) -> str: return re.sub(WIKILINK_REGEX, '', src_txt) +def _remove_URLs_from_source_text(src_txt: str) -> str: + return re.sub(URL_REGEX, '', src_txt) + + def _transform_md_file_string_for_tag_parsing(txt: str) -> str: return txt.replace('\\#', '') diff --git a/tests/vault-stub/Sussudio.md b/tests/vault-stub/Sussudio.md index e8b49ce..26fe871 100755 --- a/tests/vault-stub/Sussudio.md +++ b/tests/vault-stub/Sussudio.md @@ -32,6 +32,7 @@ However these shouldn't be recognised as tags: - #1985 - [[American Psycho (film)#Patrick Bateman]] - \#hash_char_not_tag +- docs.google.com/document/d/12345/edit#heading ```python # #code_comment_not_tag From c0415ca12c54cae3ab61aae1f1326bdebf7b2678 Mon Sep 17 00:00:00 2001 From: Emily Pastewka Date: Tue, 7 Nov 2023 20:40:49 -0700 Subject: [PATCH 2/4] test _remove_wikilinks_from_source_text too --- tests/.DS_Store | Bin 0 -> 6148 bytes tests/test_md_utils.py | 11 ++++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 tests/.DS_Store diff --git a/tests/.DS_Store b/tests/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..bb65db76aaae8dbf5788f25e7a353d3d8595ab27 GIT binary patch literal 6148 zcmeHK!A`?440Xn&b#U2{V-8$7Oz;6^DqpZ4U}Z24EuDaJC1<{ZAK}O^!0+%Jdt+5Q zu@eGhOU_FjJ8@s5Iw2xAspn&&5fLR&#@-N3hwwOSOL}VIK^Nz^qZt*&Vm!U9THa>( zjSTR&8_WIY|M~X&e>2FQi~(cd zUoqhN*)*Hrm9)0DUXE+6hh9Kg*e?jKL(qw(7`|MJ51~ Date: Tue, 8 Jul 2025 18:47:01 +0100 Subject: [PATCH 3/4] Remove DS_Store --- tests/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/.DS_Store diff --git a/tests/.DS_Store b/tests/.DS_Store deleted file mode 100644 index bb65db76aaae8dbf5788f25e7a353d3d8595ab27..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!A`?440Xn&b#U2{V-8$7Oz;6^DqpZ4U}Z24EuDaJC1<{ZAK}O^!0+%Jdt+5Q zu@eGhOU_FjJ8@s5Iw2xAspn&&5fLR&#@-N3hwwOSOL}VIK^Nz^qZt*&Vm!U9THa>( zjSTR&8_WIY|M~X&e>2FQi~(cd zUoqhN*)*Hrm9)0DUXE+6hh9Kg*e?jKL(qw(7`|MJ51~ Date: Tue, 8 Jul 2025 19:08:02 +0100 Subject: [PATCH 4/4] typo --- obsidiantools/md_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/obsidiantools/md_utils.py b/obsidiantools/md_utils.py index e11fc92..9e1a3d4 100644 --- a/obsidiantools/md_utils.py +++ b/obsidiantools/md_utils.py @@ -243,7 +243,7 @@ def get_tags(filepath: Path, *, show_nested: bool = False) -> list[str]: # remove wikilinks so that '#' headers are not caught: src_txt = _remove_wikilinks_from_source_text(src_txt) # remove URLs so that '#' in URLs are not caught - src_text = _remove_URLs_from_source_text(src_text) + src_txt = _remove_URLs_from_source_text(src_txt) tags = _get_tags_from_source_text(src_txt, show_nested=show_nested) return tags