diff --git a/obsidiantools/_constants.py b/obsidiantools/_constants.py index a5dcff2..a821c4d 100644 --- a/obsidiantools/_constants.py +++ b/obsidiantools/_constants.py @@ -1,6 +1,7 @@ # WIKILINKS AND EMBEDDED FILES: regex that includes any aliases # group 0 captures embedded link; group 1 is everything inside [[]] WIKILINK_REGEX = r'(!)?\[{2}([^\]\]]+)\]{2}' +URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))" # TAGS TAG_INCLUDE_NESTED_REGEX = r'(? list[str]: str_transform_func=_transform_md_file_string_for_tag_parsing) # remove wikilinks so that '#' headers are not caught: src_txt = _remove_wikilinks_from_source_text(src_txt) + # remove URLs so that '#' in URLs are not caught + src_txt = _remove_URLs_from_source_text(src_txt) tags = _get_tags_from_source_text(src_txt, show_nested=show_nested) return tags @@ -444,6 +447,10 @@ def _remove_wikilinks_from_source_text(src_txt: str) -> str: return re.sub(WIKILINK_REGEX, '', src_txt) +def _remove_URLs_from_source_text(src_txt: str) -> str: + return re.sub(URL_REGEX, '', src_txt) + + def _transform_md_file_string_for_tag_parsing(txt: str) -> str: return txt.replace('\\#', '') diff --git a/tests/test_md_utils.py b/tests/test_md_utils.py index b926080..0da1979 100644 --- a/tests/test_md_utils.py +++ b/tests/test_md_utils.py @@ -15,6 +15,7 @@ get_front_matter, get_tags, _remove_wikilinks_from_source_text, + _remove_URLs_from_source_text, _replace_wikilinks_with_their_text, _replace_md_links_with_their_text, get_readable_text_from_md_file) @@ -288,8 +289,16 @@ def test_latex(): assert actual_latex_list == expected_latex_list -def test_remove_wikilinks(txt_wikilink_extraction_stub): +def test_remove_wikilinks(txt_md_link_extraction_stub): out_str = _remove_wikilinks_from_source_text( + txt_md_link_extraction_stub) + + expected_str = '[Obsidian.md homepage]()\n[Github homepage]()\n\n[Github homepage]()\n' + assert out_str == expected_str + + +def test_remove_URLs(txt_wikilink_extraction_stub): + out_str = _remove_URLs_from_source_text( txt_wikilink_extraction_stub) expected_str = "\n" * 6 diff --git a/tests/vault-stub/Sussudio.md b/tests/vault-stub/Sussudio.md index e8b49ce..26fe871 100755 --- a/tests/vault-stub/Sussudio.md +++ b/tests/vault-stub/Sussudio.md @@ -32,6 +32,7 @@ However these shouldn't be recognised as tags: - #1985 - [[American Psycho (film)#Patrick Bateman]] - \#hash_char_not_tag +- docs.google.com/document/d/12345/edit#heading ```python # #code_comment_not_tag