From e12b5a6f71b161c6e92503e0ad894b69177d0f9a Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 29 Jul 2024 11:41:05 +0200 Subject: [PATCH] Re #2528 - handle zero-byte responses with "Empty pages are a change" the same as when the HTML doesnt render any useful text --- changedetectionio/content_fetchers/base.py | 3 +- .../content_fetchers/playwright.py | 9 ++-- .../content_fetchers/puppeteer.py | 15 ++++--- .../content_fetchers/requests.py | 13 +++--- .../content_fetchers/webdriver_selenium.py | 3 +- changedetectionio/processors/__init__.py | 16 ++++++- .../tests/test_nonrenderable_pages.py | 43 ++++++++++--------- 7 files changed, 62 insertions(+), 40 deletions(-) diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index 91fcc8e1b4b..66dd74036cd 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -81,7 +81,8 @@ def run(self, request_method, ignore_status_codes=False, current_include_filters=None, - is_binary=False): + is_binary=False, + empty_pages_are_a_change=False): # Should set self.error, self.status_code and self.content pass diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py index e908ce1c8a9..53be33f1d04 100644 --- a/changedetectionio/content_fetchers/playwright.py +++ b/changedetectionio/content_fetchers/playwright.py @@ -83,7 +83,8 @@ def run(self, request_method, ignore_status_codes=False, current_include_filters=None, - is_binary=False): + is_binary=False, + empty_pages_are_a_change=False): from playwright.sync_api import sync_playwright import playwright._impl._errors @@ -130,7 +131,7 @@ def run(self, if response is None: context.close() browser.close() - logger.debug("Content Fetcher > Response object was none") + logger.debug("Content Fetcher > Response object from the browser communication was none") raise EmptyReply(url=url, status_code=None) try: @@ -166,10 +167,10 @@ def run(self, raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) - if len(self.page.content().strip()) == 0: + if not empty_pages_are_a_change and len(self.page.content().strip()) == 0: + logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False") context.close() browser.close() - logger.debug("Content Fetcher > Content was empty") raise EmptyReply(url=url, status_code=response.status) # Run Browser Steps here diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index 923a2be1ce2..9dd06c38e4a 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -75,7 +75,8 @@ async def fetch_page(self, request_method, ignore_status_codes, current_include_filters, - is_binary + is_binary, + empty_pages_are_a_change ): from changedetectionio.content_fetchers import visualselector_xpath_selectors @@ -153,7 +154,7 @@ async def fetch_page(self, if response is None: await self.page.close() await browser.close() - logger.warning("Content Fetcher > Response object was none") + logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)") raise EmptyReply(url=url, status_code=None) self.headers = response.headers @@ -186,10 +187,11 @@ async def fetch_page(self, raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) content = await self.page.content - if len(content.strip()) == 0: + + if not empty_pages_are_a_change and len(content.strip()) == 0: + logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers") await self.page.close() await browser.close() - logger.error("Content Fetcher > Content was empty") raise EmptyReply(url=url, status_code=response.status) # Run Browser Steps here @@ -247,7 +249,7 @@ async def main(self, **kwargs): await self.fetch_page(**kwargs) def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False, - current_include_filters=None, is_binary=False): + current_include_filters=None, is_binary=False, empty_pages_are_a_change=False): #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180) @@ -262,7 +264,8 @@ def run(self, url, timeout, request_headers, request_body, request_method, ignor request_method=request_method, ignore_status_codes=ignore_status_codes, current_include_filters=current_include_filters, - is_binary=is_binary + is_binary=is_binary, + empty_pages_are_a_change=empty_pages_are_a_change ), timeout=max_time)) except asyncio.TimeoutError: raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds.")) diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py index 2e952c9c1eb..c39b263632b 100644 --- a/changedetectionio/content_fetchers/requests.py +++ b/changedetectionio/content_fetchers/requests.py @@ -1,9 +1,8 @@ +from loguru import logger +import chardet import hashlib import os - -import chardet import requests - from changedetectionio import strtobool from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived from changedetectionio.content_fetchers.base import Fetcher @@ -26,7 +25,8 @@ def run(self, request_method, ignore_status_codes=False, current_include_filters=None, - is_binary=False): + is_binary=False, + empty_pages_are_a_change=False): if self.browser_steps_get_valid_steps(): raise BrowserStepsInUnsupportedFetcher(url=url) @@ -74,7 +74,10 @@ def run(self, self.headers = r.headers if not r.content or not len(r.content): - raise EmptyReply(url=url, status_code=r.status_code) + if not empty_pages_are_a_change: + raise EmptyReply(url=url, status_code=r.status_code) + else: + logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True") # @todo test this # @todo maybe you really want to test zero-byte return pages? diff --git a/changedetectionio/content_fetchers/webdriver_selenium.py b/changedetectionio/content_fetchers/webdriver_selenium.py index a45746f03a8..72e80b1592a 100644 --- a/changedetectionio/content_fetchers/webdriver_selenium.py +++ b/changedetectionio/content_fetchers/webdriver_selenium.py @@ -56,7 +56,8 @@ def run(self, request_method, ignore_status_codes=False, current_include_filters=None, - is_binary=False): + is_binary=False, + empty_pages_are_a_change=False): from selenium import webdriver from selenium.webdriver.chrome.options import Options as ChromeOptions diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 0ce964977e1..529f57dabd7 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -26,6 +26,8 @@ def __init__(self, *args, datastore, watch_uuid, **kwargs): def call_browser(self): from requests.structures import CaseInsensitiveDict + from changedetectionio.content_fetchers.exceptions import EmptyReply + # Protect against file:// access if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): @@ -133,8 +135,18 @@ def call_browser(self): is_binary = self.watch.is_pdf # And here we go! call the right browser with browser-specific settings - self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'), - is_binary=is_binary) + empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) + + self.fetcher.run(url=url, + timeout=timeout, + request_headers=request_headers, + request_body=request_body, + request_method=request_method, + ignore_status_codes=ignore_status_codes, + current_include_filters=self.watch.get('include_filters'), + is_binary=is_binary, + empty_pages_are_a_change=empty_pages_are_a_change + ) #@todo .quit here could go on close object, so we can run JS if change-detected self.fetcher.quit() diff --git a/changedetectionio/tests/test_nonrenderable_pages.py b/changedetectionio/tests/test_nonrenderable_pages.py index 4e9169cfd80..3757eb6edf4 100644 --- a/changedetectionio/tests/test_nonrenderable_pages.py +++ b/changedetectionio/tests/test_nonrenderable_pages.py @@ -1,12 +1,7 @@ #!/usr/bin/env python3 -import time from flask import url_for -from urllib.request import urlopen -from .util import set_original_response, set_modified_response, live_server_setup - -sleep_time_for_fetch_thread = 3 - +from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks def set_nonrenderable_response(): test_return_data = """ @@ -22,6 +17,13 @@ def set_nonrenderable_response(): return None +def set_zero_byte_response(): + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write("") + + return None + def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage): set_original_response() live_server_setup(live_server) @@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure assert b"1 Imported" in res.data - time.sleep(sleep_time_for_fetch_thread) - - # Do this a few times.. ensures we dont accidently set the status - for n in range(3): - client.get(url_for("form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) - # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) - - # It should report nothing found (no new 'unviewed' class) - res = client.get(url_for("index")) - assert b'unviewed' not in res.data + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data ##################### @@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure client.get(url_for("form_watch_checknow"), follow_redirects=True) # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # It should report nothing found (no new 'unviewed' class) res = client.get(url_for("index")) @@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure client.get(url_for("form_watch_checknow"), follow_redirects=True) # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # It should report nothing found (no new 'unviewed' class) res = client.get(url_for("index")) assert b'unviewed' in res.data + client.get(url_for("mark_all_viewed"), follow_redirects=True) - - + # A totally zero byte (#2528) response should also not trigger an error + set_zero_byte_response() + client.get(url_for("form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON + assert b'fetch-error' not in res.data # # Cleanup everything