From e12b5a6f71b161c6e92503e0ad894b69177d0f9a Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Mon, 29 Jul 2024 11:41:05 +0200
Subject: [PATCH] Re #2528 - handle zero-byte responses with "Empty pages are a
 change" the same as when the HTML doesnt render any useful text

---
 changedetectionio/content_fetchers/base.py    |  3 +-
 .../content_fetchers/playwright.py            |  9 ++--
 .../content_fetchers/puppeteer.py             | 15 ++++---
 .../content_fetchers/requests.py              | 13 +++---
 .../content_fetchers/webdriver_selenium.py    |  3 +-
 changedetectionio/processors/__init__.py      | 16 ++++++-
 .../tests/test_nonrenderable_pages.py         | 43 ++++++++++---------
 7 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py
index 91fcc8e1b4b..66dd74036cd 100644
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -81,7 +81,8 @@ def run(self,
             request_method,
             ignore_status_codes=False,
             current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
         # Should set self.error, self.status_code and self.content
         pass
 
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index e908ce1c8a9..53be33f1d04 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -83,7 +83,8 @@ def run(self,
             request_method,
             ignore_status_codes=False,
             current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
 
         from playwright.sync_api import sync_playwright
         import playwright._impl._errors
@@ -130,7 +131,7 @@ def run(self,
             if response is None:
                 context.close()
                 browser.close()
-                logger.debug("Content Fetcher > Response object was none")
+                logger.debug("Content Fetcher > Response object from the browser communication was none")
                 raise EmptyReply(url=url, status_code=None)
 
             try:
@@ -166,10 +167,10 @@ def run(self,
 
                 raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
 
-            if len(self.page.content().strip()) == 0:
+            if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
+                logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
                 context.close()
                 browser.close()
-                logger.debug("Content Fetcher > Content was empty")
                 raise EmptyReply(url=url, status_code=response.status)
 
             # Run Browser Steps here
diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py
index 923a2be1ce2..9dd06c38e4a 100644
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -75,7 +75,8 @@ async def fetch_page(self,
                          request_method,
                          ignore_status_codes,
                          current_include_filters,
-                         is_binary
+                         is_binary,
+                         empty_pages_are_a_change
                          ):
 
         from changedetectionio.content_fetchers import visualselector_xpath_selectors
@@ -153,7 +154,7 @@ async def fetch_page(self,
         if response is None:
             await self.page.close()
             await browser.close()
-            logger.warning("Content Fetcher > Response object was none")
+            logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
             raise EmptyReply(url=url, status_code=None)
 
         self.headers = response.headers
@@ -186,10 +187,11 @@ async def fetch_page(self,
 
             raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
         content = await self.page.content
-        if len(content.strip()) == 0:
+
+        if not empty_pages_are_a_change and len(content.strip()) == 0:
+            logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
             await self.page.close()
             await browser.close()
-            logger.error("Content Fetcher > Content was empty")
             raise EmptyReply(url=url, status_code=response.status)
 
         # Run Browser Steps here
@@ -247,7 +249,7 @@ async def main(self, **kwargs):
         await self.fetch_page(**kwargs)
 
     def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
-            current_include_filters=None, is_binary=False):
+            current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):
 
         #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
         max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
@@ -262,7 +264,8 @@ def run(self, url, timeout, request_headers, request_body, request_method, ignor
                 request_method=request_method,
                 ignore_status_codes=ignore_status_codes,
                 current_include_filters=current_include_filters,
-                is_binary=is_binary
+                is_binary=is_binary,
+                empty_pages_are_a_change=empty_pages_are_a_change
             ), timeout=max_time))
         except asyncio.TimeoutError:
             raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py
index 2e952c9c1eb..c39b263632b 100644
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@@ -1,9 +1,8 @@
+from loguru import logger
+import chardet
 import hashlib
 import os
-
-import chardet
 import requests
-
 from changedetectionio import strtobool
 from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
 from changedetectionio.content_fetchers.base import Fetcher
@@ -26,7 +25,8 @@ def run(self,
             request_method,
             ignore_status_codes=False,
             current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
 
         if self.browser_steps_get_valid_steps():
             raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -74,7 +74,10 @@ def run(self,
         self.headers = r.headers
 
         if not r.content or not len(r.content):
-            raise EmptyReply(url=url, status_code=r.status_code)
+            if not empty_pages_are_a_change:
+                raise EmptyReply(url=url, status_code=r.status_code)
+            else:
+                logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")
 
         # @todo test this
         # @todo maybe you really want to test zero-byte return pages?
diff --git a/changedetectionio/content_fetchers/webdriver_selenium.py b/changedetectionio/content_fetchers/webdriver_selenium.py
index a45746f03a8..72e80b1592a 100644
--- a/changedetectionio/content_fetchers/webdriver_selenium.py
+++ b/changedetectionio/content_fetchers/webdriver_selenium.py
@@ -56,7 +56,8 @@ def run(self,
             request_method,
             ignore_status_codes=False,
             current_include_filters=None,
-            is_binary=False):
+            is_binary=False,
+            empty_pages_are_a_change=False):
 
         from selenium import webdriver
         from selenium.webdriver.chrome.options import Options as ChromeOptions
diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py
index 0ce964977e1..529f57dabd7 100644
--- a/changedetectionio/processors/__init__.py
+++ b/changedetectionio/processors/__init__.py
@@ -26,6 +26,8 @@ def __init__(self, *args, datastore, watch_uuid, **kwargs):
 
     def call_browser(self):
         from requests.structures import CaseInsensitiveDict
+        from changedetectionio.content_fetchers.exceptions import EmptyReply
+
         # Protect against file:// access
         if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
             if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@@ -133,8 +135,18 @@ def call_browser(self):
         is_binary = self.watch.is_pdf
 
         # And here we go! call the right browser with browser-specific settings
-        self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
-                    is_binary=is_binary)
+        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
+
+        self.fetcher.run(url=url,
+                         timeout=timeout,
+                         request_headers=request_headers,
+                         request_body=request_body,
+                         request_method=request_method,
+                         ignore_status_codes=ignore_status_codes,
+                         current_include_filters=self.watch.get('include_filters'),
+                         is_binary=is_binary,
+                         empty_pages_are_a_change=empty_pages_are_a_change
+                         )
 
         #@todo .quit here could go on close object, so we can run JS if change-detected
         self.fetcher.quit()
diff --git a/changedetectionio/tests/test_nonrenderable_pages.py b/changedetectionio/tests/test_nonrenderable_pages.py
index 4e9169cfd80..3757eb6edf4 100644
--- a/changedetectionio/tests/test_nonrenderable_pages.py
+++ b/changedetectionio/tests/test_nonrenderable_pages.py
@@ -1,12 +1,7 @@
 #!/usr/bin/env python3
 
-import time
 from flask import url_for
-from urllib.request import urlopen
-from .util import set_original_response, set_modified_response, live_server_setup
-
-sleep_time_for_fetch_thread = 3
-
+from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
 
 def set_nonrenderable_response():
     test_return_data = """<html>
@@ -22,6 +17,13 @@ def set_nonrenderable_response():
 
     return None
 
+def set_zero_byte_response():
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("")
+
+    return None
+
 def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
     set_original_response()
     live_server_setup(live_server)
@@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
 
     assert b"1 Imported" in res.data
 
-    time.sleep(sleep_time_for_fetch_thread)
-
-    # Do this a few times.. ensures we dont accidently set the status
-    for n in range(3):
-        client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
 
-        # Give the thread time to pick it up
-        time.sleep(sleep_time_for_fetch_thread)
-
-        # It should report nothing found (no new 'unviewed' class)
-        res = client.get(url_for("index"))
-        assert b'unviewed' not in res.data
+    # It should report nothing found (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
 
 
     #####################
@@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
 
     # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
 
     # It should report nothing found (no new 'unviewed' class)
     res = client.get(url_for("index"))
@@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
 
     # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
 
     # It should report nothing found (no new 'unviewed' class)
     res = client.get(url_for("index"))
     assert b'unviewed' in res.data
+    client.get(url_for("mark_all_viewed"), follow_redirects=True)
 
-
-
+    # A totally zero byte (#2528) response should also not trigger an error
+    set_zero_byte_response()
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
+    assert b'fetch-error' not in res.data
 
     #
     # Cleanup everything