Skip to content

Commit

Permalink
Re #2528 - handle zero-byte responses with "Empty pages are a change"…
Browse files Browse the repository at this point in the history
… the same as when the HTML doesn't render any useful text
  • Loading branch information
dgtlmoon committed Jul 29, 2024
1 parent f527744 commit e12b5a6
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 40 deletions.
3 changes: 2 additions & 1 deletion changedetectionio/content_fetchers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def run(self,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
is_binary=False,
empty_pages_are_a_change=False):
# Should set self.error, self.status_code and self.content
pass

Expand Down
9 changes: 5 additions & 4 deletions changedetectionio/content_fetchers/playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def run(self,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
is_binary=False,
empty_pages_are_a_change=False):

from playwright.sync_api import sync_playwright
import playwright._impl._errors
Expand Down Expand Up @@ -130,7 +131,7 @@ def run(self,
if response is None:
context.close()
browser.close()
logger.debug("Content Fetcher > Response object was none")
logger.debug("Content Fetcher > Response object from the browser communication was none")
raise EmptyReply(url=url, status_code=None)

try:
Expand Down Expand Up @@ -166,10 +167,10 @@ def run(self,

raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

if len(self.page.content().strip()) == 0:
if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
context.close()
browser.close()
logger.debug("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)

# Run Browser Steps here
Expand Down
15 changes: 9 additions & 6 deletions changedetectionio/content_fetchers/puppeteer.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ async def fetch_page(self,
request_method,
ignore_status_codes,
current_include_filters,
is_binary
is_binary,
empty_pages_are_a_change
):

from changedetectionio.content_fetchers import visualselector_xpath_selectors
Expand Down Expand Up @@ -153,7 +154,7 @@ async def fetch_page(self,
if response is None:
await self.page.close()
await browser.close()
logger.warning("Content Fetcher > Response object was none")
logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
raise EmptyReply(url=url, status_code=None)

self.headers = response.headers
Expand Down Expand Up @@ -186,10 +187,11 @@ async def fetch_page(self,

raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
content = await self.page.content
if len(content.strip()) == 0:

if not empty_pages_are_a_change and len(content.strip()) == 0:
logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
await self.page.close()
await browser.close()
logger.error("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)

# Run Browser Steps here
Expand Down Expand Up @@ -247,7 +249,7 @@ async def main(self, **kwargs):
await self.fetch_page(**kwargs)

def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
current_include_filters=None, is_binary=False):
current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):

#@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
Expand All @@ -262,7 +264,8 @@ def run(self, url, timeout, request_headers, request_body, request_method, ignor
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=current_include_filters,
is_binary=is_binary
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
), timeout=max_time))
except asyncio.TimeoutError:
raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
Expand Down
13 changes: 8 additions & 5 deletions changedetectionio/content_fetchers/requests.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from loguru import logger
import chardet
import hashlib
import os

import chardet
import requests

from changedetectionio import strtobool
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
from changedetectionio.content_fetchers.base import Fetcher
Expand All @@ -26,7 +25,8 @@ def run(self,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
is_binary=False,
empty_pages_are_a_change=False):

if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)
Expand Down Expand Up @@ -74,7 +74,10 @@ def run(self,
self.headers = r.headers

if not r.content or not len(r.content):
raise EmptyReply(url=url, status_code=r.status_code)
if not empty_pages_are_a_change:
raise EmptyReply(url=url, status_code=r.status_code)
else:
logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")

# @todo test this
# @todo maybe you really want to test zero-byte return pages?
Expand Down
3 changes: 2 additions & 1 deletion changedetectionio/content_fetchers/webdriver_selenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ def run(self,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
is_binary=False,
empty_pages_are_a_change=False):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
Expand Down
16 changes: 14 additions & 2 deletions changedetectionio/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def __init__(self, *args, datastore, watch_uuid, **kwargs):

def call_browser(self):
from requests.structures import CaseInsensitiveDict
from changedetectionio.content_fetchers.exceptions import EmptyReply

# Protect against file:// access
if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
Expand Down Expand Up @@ -133,8 +135,18 @@ def call_browser(self):
is_binary = self.watch.is_pdf

# And here we go! call the right browser with browser-specific settings
self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
is_binary=is_binary)
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)

self.fetcher.run(url=url,
timeout=timeout,
request_headers=request_headers,
request_body=request_body,
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=self.watch.get('include_filters'),
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
)

#@todo .quit here could go on close object, so we can run JS if change-detected
self.fetcher.quit()
Expand Down
43 changes: 22 additions & 21 deletions changedetectionio/tests/test_nonrenderable_pages.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
#!/usr/bin/env python3

import time
from flask import url_for
from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup

sleep_time_for_fetch_thread = 3

from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks

def set_nonrenderable_response():
test_return_data = """<html>
Expand All @@ -22,6 +17,13 @@ def set_nonrenderable_response():

return None

def set_zero_byte_response():

with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("")

return None

def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
set_original_response()
live_server_setup(live_server)
Expand All @@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure

assert b"1 Imported" in res.data

time.sleep(sleep_time_for_fetch_thread)

    # Do this a few times.. ensures we don't accidentally set the status
for n in range(3):
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)

# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)

# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data


#####################
Expand All @@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True)

# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
wait_for_all_checks(client)

# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
Expand All @@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True)

# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
wait_for_all_checks(client)

# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' in res.data
client.get(url_for("mark_all_viewed"), follow_redirects=True)



# A totally zero byte (#2528) response should also not trigger an error
set_zero_byte_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
assert b'fetch-error' not in res.data

#
# Cleanup everything
Expand Down

0 comments on commit e12b5a6

Please sign in to comment.