From 5108201f0bcd1a9890438eb22b2001efa03f2fb9 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 10 Sep 2024 14:28:58 +0200 Subject: [PATCH] Filters should apply at the end of the chain --- .../processors/text_json_diff/processor.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 2327423d475..8bccde19b02 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -204,18 +204,6 @@ def run_changedetection(self, watch, skip_when_checksum_same=True): is_rss=is_rss # #1874 activate the something</p> will add an extra line feed to signify the paragraph gap - # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. - stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n') - stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() )) - - if watch.get('trim_text_whitespace') and stripped_text_from_html: - stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.splitlines()) - - if watch.get('remove_duplicate_lines') and stripped_text_from_html: - stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.splitlines())) - # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') @@ -304,6 +292,20 @@ def run_changedetection(self, watch, skip_when_checksum_same=True): stripped_text_from_html = b''.join(regex_matched_output) text_content_before_ignored_filter = stripped_text_from_html + + if watch.get('sort_text_alphabetically') and stripped_text_from_html: + # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap + # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. + stripped_text_from_html = stripped_text_from_html.replace(b'\n\n', b'\n') + stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.decode('utf-8').splitlines(), key=lambda x: x.lower())).encode('utf-8') + + # + if watch.get('trim_text_whitespace') and stripped_text_from_html: + stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.decode('utf-8').splitlines()).encode('utf-8') +# + if watch.get('remove_duplicate_lines') and stripped_text_from_html: + stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.decode('utf-8').splitlines())).encode('utf-8') + # Re #133 - if we should strip whitespaces from triggering the change detected comparison if self.datastore.data['settings']['application'].get('ignore_whitespace', False): fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()