Skip to content

Commit

Permalink
Fix - Regular Expression text in ignore and trigger were not processing correctly, also refactored for lower CPU usage (#1747)
Browse files Browse the repository at this point in the history
  • Loading branch information
dgtlmoon committed Sep 5, 2023
1 parent d34832d commit 40d01ac
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 20 deletions.
46 changes: 27 additions & 19 deletions changedetectionio/html_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,42 +191,50 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
#
# wordlist - list of regexes (str) or words (str)
def strip_ignore_text(content, wordlist, mode="content"):
ignore = []
i = 0
output = []
ignore_text = []
ignore_regex = []

# @todo check this runs case insensitive
for k in wordlist:
ignored_line_numbers = []

for k in wordlist:
# Is it a regex?
if k[0] == '/':
ignore_regex.append(k.strip(" /"))
x = re.search('^\/(.*)\/(.*)', k.strip())
if x:
# Starts with / but doesn't look like a regex
p = x.group(1)
try:
# @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
except Exception as e:
# Badly formed regex, treat as text
ignore_text.append(k.strip())
else:
ignore.append(k)
# Had a / but doesn't work as regex
ignore_text.append(k.strip())

i = 0
output = []
ignored_line_numbers = []
for line in content.splitlines():
i += 1
# Always ignore blank lines in this mode. (when this function gets called)
got_match = False
if len(line.strip()):
regex_matches = False
for l in ignore_text:
if l.lower() in line.lower():
got_match = True

# if any of these match, skip
for regex in ignore_regex:
try:
if re.search(regex, line, re.IGNORECASE):
regex_matches = True
except Exception as e:
continue
if not got_match:
for r in ignore_regex:
if r.search(line):
got_match = True

if not regex_matches and not any(skip_text.lower() in line.lower() for skip_text in ignore):
if not got_match:
# Not ignored
output.append(line.encode('utf8'))
else:
ignored_line_numbers.append(i)



# Used for finding out what to highlight
if mode == "line numbers":
return ignored_line_numbers
Expand Down
21 changes: 20 additions & 1 deletion changedetectionio/tests/test_ignore_regex_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,35 @@ def test_strip_regex_text_func():
but sometimes we want to remove the lines.
but 1 lines
skip 5 lines
really? yes man
#/not this tries weirdly formed regex or just strings starting with /
/not this
but including 1234 lines
igNORe-cAse text we dont want to keep
but not always."""

ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"]

ignore_lines = [
"sometimes",
"/\s\d{2,3}\s/",
"/ignore-case text/",
"really?",
"/skip \d lines/i",
"/not"
]


fetcher = fetch_site_status.perform_site_check(datastore=False)
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

assert b"but 1 lines" in stripped_content
assert b"igNORe-cAse text" not in stripped_content
assert b"but 1234 lines" not in stripped_content
assert b"really" not in stripped_content
assert b"not this" not in stripped_content

# Check line number reporting
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
assert stripped_content == [2, 5, 6, 7, 8, 10]

0 comments on commit 40d01ac

Please sign in to comment.