From e12b5a6f71b161c6e92503e0ad894b69177d0f9a Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Mon, 29 Jul 2024 11:41:05 +0200
Subject: [PATCH 1/3] Re #2528 - handle zero-byte responses with "Empty pages are a change" the same as when the HTML doesn't render any useful text

---
 changedetectionio/content_fetchers/base.py    |  3 +-
 .../content_fetchers/playwright.py            |  9 ++--
 .../content_fetchers/puppeteer.py             | 15 ++++---
 .../content_fetchers/requests.py              | 13 +++---
 .../content_fetchers/webdriver_selenium.py    |  3 +-
 changedetectionio/processors/__init__.py      | 16 ++++++-
 .../tests/test_nonrenderable_pages.py         | 43 ++++++++++---------
 7 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py
index 91fcc8e1b4b..66dd74036cd 100644
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -81,7 +81,8 @@ def run(self,
             request_method,
             ignore_status_codes=False,
             current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
         # Should set self.error, self.status_code and self.content
         pass
 
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index e908ce1c8a9..53be33f1d04 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -83,7 +83,8 @@ def run(self,
             request_method,
             ignore_status_codes=False,
             current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
 
         from playwright.sync_api import sync_playwright
         import playwright._impl._errors
@@ -130,7 +131,7 @@ def run(self,
             if response is None:
                 context.close()
                 browser.close()
-                logger.debug("Content Fetcher > Response object was none")
+                logger.debug("Content Fetcher > Response object from the browser communication was none")
                 raise EmptyReply(url=url, status_code=None)
 
             try:
@@ -166,10 +167,10 @@ def run(self,
                 raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
 
-            if len(self.page.content().strip()) == 0:
+            if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
+                logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
                 context.close()
                 browser.close()
-                logger.debug("Content Fetcher > Content was empty")
                 raise EmptyReply(url=url, status_code=response.status)
 
             # Run Browser Steps here
diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py
index 923a2be1ce2..9dd06c38e4a 100644
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -75,7 +75,8 @@ async def fetch_page(self,
                      request_method,
                      ignore_status_codes,
                      current_include_filters,
-                     is_binary
+                     is_binary,
+                     empty_pages_are_a_change
                      ):
 
     from changedetectionio.content_fetchers import visualselector_xpath_selectors
@@ -153,7 +154,7 @@ async def fetch_page(self,
        if response is None:
            await self.page.close()
            await browser.close()
-           logger.warning("Content Fetcher > Response object was none")
+           logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
            raise EmptyReply(url=url, status_code=None)
 
        self.headers = response.headers
@@ -186,10 +187,11 @@ async def fetch_page(self,
            raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
 
        content = await self.page.content
-       if len(content.strip()) == 0:
+
+       if not empty_pages_are_a_change and len(content.strip()) == 0:
+           logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
            await self.page.close()
            await browser.close()
-           logger.error("Content Fetcher > Content was empty")
            raise EmptyReply(url=url, status_code=response.status)
 
        # Run Browser Steps here
@@ -247,7 +249,7 @@ async def main(self, **kwargs):
        await self.fetch_page(**kwargs)
 
    def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
-           current_include_filters=None, is_binary=False):
+           current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
 
        #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
        max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
@@ -262,7 +264,8 @@ def run(self, url, timeout, request_headers, request_body, request_method, ignor
                request_method=request_method,
                ignore_status_codes=ignore_status_codes,
                current_include_filters=current_include_filters,
-               is_binary=is_binary
+               is_binary=is_binary,
+               empty_pages_are_a_change=empty_pages_are_a_change
            ), timeout=max_time))
        except asyncio.TimeoutError:
            raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py
index 2e952c9c1eb..c39b263632b 100644
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@@ -1,9 +1,8 @@
+from loguru import logger
+import chardet
 import hashlib
 import os
-
-import chardet
 import requests
-
 from changedetectionio import strtobool
 from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
 from changedetectionio.content_fetchers.base import Fetcher
@@ -26,7 +25,8 @@ def run(self,
             request_method,
             ignore_status_codes=False,
             current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
 
         if self.browser_steps_get_valid_steps():
             raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -74,7 +74,10 @@ def run(self,
         self.headers = r.headers
 
         if not r.content or not len(r.content):
-            raise EmptyReply(url=url, status_code=r.status_code)
+            if not empty_pages_are_a_change:
+                raise EmptyReply(url=url, status_code=r.status_code)
+            else:
+                logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")
 
         # @todo test this
         # @todo maybe you really want to test zero-byte return pages?
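Taken together, the fetcher changes above thread one new keyword argument, empty_pages_are_a_change, through to the point where an empty reply is detected. The sketch below is illustrative only and is not code from the patch (the standalone EmptyReply class and the handle_reply() helper are invented for the example); it mirrors the branch added to requests.py: a zero-byte body raises EmptyReply when the setting is off, and is passed through so it can register as a change when the setting is on.

    class EmptyReply(Exception):
        # Stand-in for changedetectionio.content_fetchers.exceptions.EmptyReply
        def __init__(self, url, status_code=None):
            super().__init__(f"Empty reply from {url} (status {status_code})")
            self.status_code = status_code

    def handle_reply(url, content, status_code, empty_pages_are_a_change=False):
        # Hypothetical helper mirroring the new requests.py branch
        if not content or not len(content):
            if not empty_pages_are_a_change:
                raise EmptyReply(url=url, status_code=status_code)
            # else: keep the empty body so the zero-byte reply is recorded as a change
        return content

    # With the setting on, a zero-byte reply is tolerated...
    assert handle_reply("https://example.com", b"", 200, empty_pages_are_a_change=True) == b""
    # ...with it off (the default), the same reply raises EmptyReply.
    try:
        handle_reply("https://example.com", b"", 200)
    except EmptyReply as e:
        assert e.status_code == 200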
diff --git a/changedetectionio/content_fetchers/webdriver_selenium.py b/changedetectionio/content_fetchers/webdriver_selenium.py
index a45746f03a8..72e80b1592a 100644
--- a/changedetectionio/content_fetchers/webdriver_selenium.py
+++ b/changedetectionio/content_fetchers/webdriver_selenium.py
@@ -56,7 +56,8 @@ def run(self,
             request_method,
             ignore_status_codes=False,
             current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
 
         from selenium import webdriver
         from selenium.webdriver.chrome.options import Options as ChromeOptions
diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py
index 0ce964977e1..529f57dabd7 100644
--- a/changedetectionio/processors/__init__.py
+++ b/changedetectionio/processors/__init__.py
@@ -26,6 +26,8 @@ def __init__(self, *args, datastore, watch_uuid, **kwargs):
 
     def call_browser(self):
         from requests.structures import CaseInsensitiveDict
+        from changedetectionio.content_fetchers.exceptions import EmptyReply
+
         # Protect against file:// access
         if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
             if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@@ -133,8 +135,18 @@ def call_browser(self):
         is_binary = self.watch.is_pdf
 
         # And here we go! call the right browser with browser-specific settings
-        self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
-                         is_binary=is_binary)
+        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
+
+        self.fetcher.run(url=url,
+                         timeout=timeout,
+                         request_headers=request_headers,
+                         request_body=request_body,
+                         request_method=request_method,
+                         ignore_status_codes=ignore_status_codes,
+                         current_include_filters=self.watch.get('include_filters'),
+                         is_binary=is_binary,
+                         empty_pages_are_a_change=empty_pages_are_a_change
+                         )
 
         #@todo .quit here could go on close object, so we can run JS if change-detected
         self.fetcher.quit()
diff --git a/changedetectionio/tests/test_nonrenderable_pages.py b/changedetectionio/tests/test_nonrenderable_pages.py
index 4e9169cfd80..3757eb6edf4 100644
--- a/changedetectionio/tests/test_nonrenderable_pages.py
+++ b/changedetectionio/tests/test_nonrenderable_pages.py
@@ -1,12 +1,7 @@
 #!/usr/bin/env python3
 
-import time
 from flask import url_for
-from urllib.request import urlopen
-from .util import set_original_response, set_modified_response, live_server_setup
-
-sleep_time_for_fetch_thread = 3
-
+from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
 
 def set_nonrenderable_response():
     test_return_data = """
@@ -22,6 +17,13 @@ def set_nonrenderable_response():
 
     return None
 
+def set_zero_byte_response():
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("")
+
+    return None
+
 def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
     set_original_response()
     live_server_setup(live_server)
@@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
 
     assert b"1 Imported" in res.data
 
-    time.sleep(sleep_time_for_fetch_thread)
-
-    # Do this a few times.. ensures we dont accidently set the status
-    for n in range(3):
-        client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
 
-        # Give the thread time to pick it up
-        time.sleep(sleep_time_for_fetch_thread)
-
-        # It should report nothing found (no new 'unviewed' class)
-        res = client.get(url_for("index"))
-        assert b'unviewed' not in res.data
+    # It should report nothing found (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
 
     #####################
 
@@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
 
     # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
 
     # It should report nothing found (no new 'unviewed' class)
     res = client.get(url_for("index"))
@@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
 
     # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
 
     # It should report nothing found (no new 'unviewed' class)
     res = client.get(url_for("index"))
     assert b'unviewed' in res.data
+    client.get(url_for("mark_all_viewed"), follow_redirects=True)
 
-
-
+    # A totally zero byte (#2528) response should also not trigger an error
+    set_zero_byte_response()
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data  # A change should have registered because empty_pages_are_a_change is ON
+    assert b'fetch-error' not in res.data
 
     #
     # Cleanup everything

From b97d34d77c0f26b97b41048fb276ca6b5d81b034 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Mon, 29 Jul 2024 11:45:44 +0200
Subject: [PATCH 2/3] Bump field text

---
 changedetectionio/templates/settings.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html
index e07abfa7e43..f1131f94ad1 100644
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -76,7 +76,7 @@
                 {{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
-                When a page contains HTML, but no renderable text appears (empty page), is this considered a change?
+                When a request returns no content, or the HTML does not contain any text, is this considered a change?
 {% if form.requests.proxy %}
From 4fdabd53fc46143a651c55df7d4f30f34e740c6a Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Mon, 29 Jul 2024 12:50:43 +0200
Subject: [PATCH 3/3] Solve circular import

---
 changedetectionio/update_worker.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py
index 1b2207f845a..ba18384838d 100644
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -1,6 +1,5 @@
 from .processors.exceptions import ProcessorException
-from . import content_fetchers
-
+import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
 from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
 from changedetectionio import html_tools
 
@@ -301,7 +300,7 @@ def run(self):
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
                         process_changedetection_results = False
 
-                    except content_fetchers.exceptions.ReplyWithContentButNoText as e:
+                    except content_fetchers_exceptions.ReplyWithContentButNoText as e:
                         # Totally fine, it's by choice - just continue on, nothing more to care about
                         # Page had elements/content but no renderable text
                         # Backend (not filters) gave zero output
@@ -327,7 +326,7 @@ def run(self):
 
                         process_changedetection_results = False
 
-                    except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
+                    except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
                         if e.status_code == 403:
                             err_text = "Error - 403 (Access denied) received"
                         elif e.status_code == 404:
@@ -380,23 +379,23 @@ def run(self):
 
                         process_changedetection_results = False
 
-                    except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
+                    except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
                         # Yes fine, so nothing todo, don't continue to process.
                         process_changedetection_results = False
                         changed_detected = False
 
-                    except content_fetchers.exceptions.BrowserConnectError as e:
+                    except content_fetchers_exceptions.BrowserConnectError as e:
                         self.datastore.update_watch(uuid=uuid,
                                                     update_obj={'last_error': e.msg
                                                                 }
                                                     )
                        process_changedetection_results = False
 
-                    except content_fetchers.exceptions.BrowserFetchTimedOut as e:
+                    except content_fetchers_exceptions.BrowserFetchTimedOut as e:
                         self.datastore.update_watch(uuid=uuid,
                                                     update_obj={'last_error': e.msg
                                                                 }
                                                     )
                        process_changedetection_results = False
 
-                    except content_fetchers.exceptions.BrowserStepsStepException as e:
+                    except content_fetchers_exceptions.BrowserStepsStepException as e:
                         if not self.datastore.data['watching'].get(uuid):
                             continue
 
@@ -438,25 +437,25 @@ def run(self):
 
                         process_changedetection_results = False
 
-                    except content_fetchers.exceptions.EmptyReply as e:
+                    except content_fetchers_exceptions.EmptyReply as e:
                         # Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, 'last_check_status': e.status_code}) process_changedetection_results = False - except content_fetchers.exceptions.ScreenshotUnavailable as e: + except content_fetchers_exceptions.ScreenshotUnavailable as e: err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'" self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, 'last_check_status': e.status_code}) process_changedetection_results = False - except content_fetchers.exceptions.JSActionExceptions as e: + except content_fetchers_exceptions.JSActionExceptions as e: err_text = "Error running JS Actions - Page request - "+e.message if e.screenshot: watch.save_screenshot(screenshot=e.screenshot, as_error=True) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, 'last_check_status': e.status_code}) process_changedetection_results = False - except content_fetchers.exceptions.PageUnloadable as e: + except content_fetchers_exceptions.PageUnloadable as e: err_text = "Page request from server didnt respond correctly" if e.message: err_text = "{} - {}".format(err_text, e.message) @@ -468,7 +467,7 @@ def run(self): 'last_check_status': e.status_code, 'has_ldjson_price_data': None}) process_changedetection_results = False - except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e: + except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e: err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher." self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text}) process_changedetection_results = False