Fix - Regular Expression text in ignore and trigger were not processing correctly, also refactored for lower CPU usage (#1747)
dgtlmoon · Sep 5, 2023 · 40d01ac · 40d01ac
1 parent d34832d
commit 40d01ac
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 20 deletions.
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
@@ -191,42 +191,50 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
 #
 # wordlist - list of regex's (str) or words (str)
 def strip_ignore_text(content, wordlist, mode="content"):
-    ignore = []
+    i = 0
+    output = []
+    ignore_text = []
     ignore_regex = []
 
-    # @todo check this runs case insensitive
-    for k in wordlist:
+    ignored_line_numbers = []
 
+    for k in wordlist:
         # Is it a regex?
-        if k[0] == '/':
-            ignore_regex.append(k.strip(" /"))
+        x = re.search('^\/(.*)\/(.*)', k.strip())
+        if x:
+            # Starts with / but doesn't look like a regex
+            p = x.group(1)
+            try:
+                # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
+                ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
+            except Exception as e:
+                # Badly formed regex, treat as text
+                ignore_text.append(k.strip())
         else:
-            ignore.append(k)
+            # Had a / but doesn't work as regex
+            ignore_text.append(k.strip())
 
-    i = 0
-    output = []
-    ignored_line_numbers = []
     for line in content.splitlines():
         i += 1
         # Always ignore blank lines in this mode. (when this function gets called)
+        got_match = False
         if len(line.strip()):
-            regex_matches = False
+            for l in ignore_text:
+                if l.lower() in line.lower():
+                    got_match = True
 
-            # if any of these match, skip
-            for regex in ignore_regex:
-                try:
-                    if re.search(regex, line, re.IGNORECASE):
-                        regex_matches = True
-                except Exception as e:
-                    continue
+            if not got_match:
+                for r in ignore_regex:
+                    if r.search(line):
+                        got_match = True
 
-            if not regex_matches and not any(skip_text.lower() in line.lower() for skip_text in ignore):
+            if not got_match:
+                # Not ignored
                 output.append(line.encode('utf8'))
             else:
                 ignored_line_numbers.append(i)
 
 
-
     # Used for finding out what to highlight
     if mode == "line numbers":
         return ignored_line_numbers

diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py
@@ -15,16 +15,35 @@ def test_strip_regex_text_func():
     but sometimes we want to remove the lines.
     
     but 1 lines
+    skip 5 lines
+    really? yes man
+#/not this tries weirdly formed regex or just strings starting with /
+/not this
     but including 1234 lines
     igNORe-cAse text we dont want to keep    
     but not always."""
 
-    ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"]
+
+    ignore_lines = [
+        "sometimes",
+        "/\s\d{2,3}\s/",
+        "/ignore-case text/",
+        "really?",
+        "/skip \d lines/i",
+        "/not"
+    ]
+
 
     fetcher = fetch_site_status.perform_site_check(datastore=False)
     stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
 
     assert b"but 1 lines" in stripped_content
     assert b"igNORe-cAse text" not in stripped_content
     assert b"but 1234 lines" not in stripped_content
+    assert b"really" not in stripped_content
+    assert b"not this" not in stripped_content
+
+    # Check line number reporting
+    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
+    assert stripped_content == [2, 5, 6, 7, 8, 10]