Handle zero-byte responses with "Empty pages are a change" option, the same as when the HTML doesn't render any useful text #2530

Merged
merged 3 commits
Jul 29, 2024
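
In short: every content fetcher's run() gains an empty_pages_are_a_change keyword (default False), and a zero-byte or text-free reply only raises EmptyReply while that option is switched off; when it is on, the empty body flows through and can register as a change. A minimal, self-contained sketch of the gating rule (EmptyReply here is a stand-in for the project's exception class, not the real implementation):

    class EmptyReply(Exception):
        # Stand-in for changedetectionio.content_fetchers.exceptions.EmptyReply
        def __init__(self, url, status_code):
            self.url = url
            self.status_code = status_code

    def gate_empty_body(url, body, status_code, empty_pages_are_a_change=False):
        # Mirrors the check the PR adds to the fetchers: an empty body is an
        # error only while "Empty pages are a change" is disabled.
        if not body or not body.strip():
            if not empty_pages_are_a_change:
                raise EmptyReply(url=url, status_code=status_code)
        return body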
3 changes: 2 additions & 1 deletion changedetectionio/content_fetchers/base.py
@@ -81,7 +81,8 @@ def run(self,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
is_binary=False,
empty_pages_are_a_change=False):
# Should set self.error, self.status_code and self.content
pass
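
Because the abstract signature changes, every fetcher subclass has to accept the new keyword as well. A hypothetical minimal subclass showing only the updated signature (the body is schematic):

    from changedetectionio.content_fetchers.base import Fetcher

    class MyFetcher(Fetcher):
        # Hypothetical subclass; only the signature is the point here.
        def run(self, url, timeout, request_headers, request_body, request_method,
                ignore_status_codes=False, current_include_filters=None,
                is_binary=False, empty_pages_are_a_change=False):
            # Fetch the page, set self.content / self.status_code / self.headers,
            # and only raise EmptyReply for an empty body when
            # empty_pages_are_a_change is False.
            ...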

9 changes: 5 additions & 4 deletions changedetectionio/content_fetchers/playwright.py
@@ -83,7 +83,8 @@ def run(self,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
is_binary=False,
empty_pages_are_a_change=False):

from playwright.sync_api import sync_playwright
import playwright._impl._errors
@@ -130,7 +131,7 @@ def run(self,
if response is None:
context.close()
browser.close()
logger.debug("Content Fetcher > Response object was none")
logger.debug("Content Fetcher > Response object from the browser communication was none")
raise EmptyReply(url=url, status_code=None)

try:
Expand Down Expand Up @@ -166,10 +167,10 @@ def run(self,

raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

if len(self.page.content().strip()) == 0:
if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
context.close()
browser.close()
logger.debug("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)

# Run Browser Steps here
15 changes: 9 additions & 6 deletions changedetectionio/content_fetchers/puppeteer.py
@@ -75,7 +75,8 @@ async def fetch_page(self,
request_method,
ignore_status_codes,
current_include_filters,
is_binary
is_binary,
empty_pages_are_a_change
):

from changedetectionio.content_fetchers import visualselector_xpath_selectors
@@ -153,7 +154,7 @@ async def fetch_page(self,
if response is None:
await self.page.close()
await browser.close()
logger.warning("Content Fetcher > Response object was none")
logger.warning("Content Fetcher > Response object was none (as in, the response from the browser was empty, not just the content)")
raise EmptyReply(url=url, status_code=None)

self.headers = response.headers
@@ -186,10 +187,11 @@ async def fetch_page(self,

raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
content = await self.page.content
if len(content.strip()) == 0:

if not empty_pages_are_a_change and len(content.strip()) == 0:
logger.error("Content Fetcher > Content was empty (empty_pages_are_a_change is False), closing browsers")
await self.page.close()
await browser.close()
logger.error("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)

# Run Browser Steps here
@@ -247,7 +249,7 @@ async def main(self, **kwargs):
await self.fetch_page(**kwargs)

def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
current_include_filters=None, is_binary=False):
current_include_filters=None, is_binary=False, empty_pages_are_a_change=False):

#@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
max_time = os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)
@@ -262,7 +264,8 @@ def run(self, url, timeout, request_headers, request_body, request_method, ignor
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=current_include_filters,
is_binary=is_binary
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
), timeout=max_time))
except asyncio.TimeoutError:
raise(BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
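
The synchronous run() wrapper simply forwards the new keyword into the async fetch_page() and still enforces the overall processing timeout. The wrapping pattern in isolation (the env var name is taken from the diff above; everything else is schematic):

    import asyncio
    import os

    max_time = int(os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180))

    async def fetch_page(**kwargs):
        ...  # stand-in for the real async fetch above

    try:
        asyncio.run(asyncio.wait_for(fetch_page(empty_pages_are_a_change=False),
                                     timeout=max_time))
    except asyncio.TimeoutError:
        # The real code raises BrowserFetchTimedOut here
        raise RuntimeError(f"Browser connected but was unable to process the page in {max_time} seconds.")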
13 changes: 8 additions & 5 deletions changedetectionio/content_fetchers/requests.py
@@ -1,9 +1,8 @@
from loguru import logger
import chardet
import hashlib
import os

import chardet
import requests

from changedetectionio import strtobool
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
from changedetectionio.content_fetchers.base import Fetcher
@@ -26,7 +25,8 @@ def run(self,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
is_binary=False,
empty_pages_are_a_change=False):

if self.browser_steps_get_valid_steps():
raise BrowserStepsInUnsupportedFetcher(url=url)
@@ -74,7 +74,10 @@ def run(self,
self.headers = r.headers

if not r.content or not len(r.content):
raise EmptyReply(url=url, status_code=r.status_code)
if not empty_pages_are_a_change:
raise EmptyReply(url=url, status_code=r.status_code)
else:
logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")

# @todo test this
# @todo maybe you really want to test zero-byte return pages?
3 changes: 2 additions & 1 deletion changedetectionio/content_fetchers/webdriver_selenium.py
@@ -56,7 +56,8 @@ def run(self,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
is_binary=False,
empty_pages_are_a_change=False):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
16 changes: 14 additions & 2 deletions changedetectionio/processors/__init__.py
@@ -26,6 +26,8 @@ def __init__(self, *args, datastore, watch_uuid, **kwargs):

def call_browser(self):
from requests.structures import CaseInsensitiveDict
from changedetectionio.content_fetchers.exceptions import EmptyReply

# Protect against file:// access
if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@@ -133,8 +135,18 @@ def call_browser(self):
is_binary = self.watch.is_pdf

# And here we go! call the right browser with browser-specific settings
self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
is_binary=is_binary)
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)

self.fetcher.run(url=url,
timeout=timeout,
request_headers=request_headers,
request_body=request_body,
request_method=request_method,
ignore_status_codes=ignore_status_codes,
current_include_filters=self.watch.get('include_filters'),
is_binary=is_binary,
empty_pages_are_a_change=empty_pages_are_a_change
)

#@todo .quit here could go on close object, so we can run JS if change-detected
self.fetcher.quit()
2 changes: 1 addition & 1 deletion changedetectionio/templates/settings.html
@@ -76,7 +76,7 @@
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }}
<span class="pure-form-message-inline">When a page contains HTML, but no renderable text appears (empty page), is this considered a change?</span>
<span class="pure-form-message-inline">When a request returns no content, or the HTML does not contain any text, is this considered a change?</span>
</div>
{% if form.requests.proxy %}
<div class="pure-control-group inline-radio">
43 changes: 22 additions & 21 deletions changedetectionio/tests/test_nonrenderable_pages.py
@@ -1,12 +1,7 @@
#!/usr/bin/env python3

import time
from flask import url_for
from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup

sleep_time_for_fetch_thread = 3

from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks

def set_nonrenderable_response():
test_return_data = """<html>
@@ -22,6 +17,13 @@ def set_nonrenderable_response():

return None

def set_zero_byte_response():

with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("")

return None

def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage):
set_original_response()
live_server_setup(live_server)
@@ -35,18 +37,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure

assert b"1 Imported" in res.data

time.sleep(sleep_time_for_fetch_thread)

# Do this a few times.. ensures we dont accidently set the status
for n in range(3):
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)

# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)

# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data


#####################
@@ -64,7 +59,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True)

# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
wait_for_all_checks(client)

# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
@@ -86,14 +81,20 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
client.get(url_for("form_watch_checknow"), follow_redirects=True)

# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
wait_for_all_checks(client)

# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' in res.data
client.get(url_for("mark_all_viewed"), follow_redirects=True)



# A totally zero byte (#2528) response should also not trigger an error
set_zero_byte_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'unviewed' in res.data # A change should have registered because empty_pages_are_a_change is ON
assert b'fetch-error' not in res.data

#
# Cleanup everything
25 changes: 12 additions & 13 deletions changedetectionio/update_worker.py
@@ -1,6 +1,5 @@
from .processors.exceptions import ProcessorException
from . import content_fetchers

import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools

@@ -301,7 +300,7 @@ def run(self):
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
process_changedetection_results = False

except content_fetchers.exceptions.ReplyWithContentButNoText as e:
except content_fetchers_exceptions.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
# Backend (not filters) gave zero output
@@ -327,7 +326,7 @@ def run(self):

process_changedetection_results = False

except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 403:
err_text = "Error - 403 (Access denied) received"
elif e.status_code == 404:
@@ -380,23 +379,23 @@ def run(self):

process_changedetection_results = False

except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
# Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False
changed_detected = False
except content_fetchers.exceptions.BrowserConnectError as e:
except content_fetchers_exceptions.BrowserConnectError as e:
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg
}
)
process_changedetection_results = False
except content_fetchers.exceptions.BrowserFetchTimedOut as e:
except content_fetchers_exceptions.BrowserFetchTimedOut as e:
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg
}
)
process_changedetection_results = False
except content_fetchers.exceptions.BrowserStepsStepException as e:
except content_fetchers_exceptions.BrowserStepsStepException as e:

if not self.datastore.data['watching'].get(uuid):
continue
Expand Down Expand Up @@ -438,25 +437,25 @@ def run(self):

process_changedetection_results = False

except content_fetchers.exceptions.EmptyReply as e:
except content_fetchers_exceptions.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers.exceptions.ScreenshotUnavailable as e:
except content_fetchers_exceptions.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers.exceptions.JSActionExceptions as e:
except content_fetchers_exceptions.JSActionExceptions as e:
err_text = "Error running JS Actions - Page request - "+e.message
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers.exceptions.PageUnloadable as e:
except content_fetchers_exceptions.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly"
if e.message:
err_text = "{} - {}".format(err_text, e.message)
Expand All @@ -468,7 +467,7 @@ def run(self):
'last_check_status': e.status_code,
'has_ldjson_price_data': None})
process_changedetection_results = False
except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e:
err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False
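
The update_worker changes are a mechanical rename: the exceptions module is imported once under an alias rather than being reached through the content_fetchers package attribute. The pattern in isolation (module path taken from the diff; the surrounding worker logic is elided):

    import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions

    try:
        ...  # fetch and process the watch (elided)
    except content_fetchers_exceptions.EmptyReply as e:
        # Surfaced in the UI as the watch's last_error / last_check_status
        ...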