Skip to content

Commit

Permalink
Merge branch 'master' into text-filter-preview
Browse files Browse the repository at this point in the history
  • Loading branch information
dgtlmoon committed Sep 18, 2024
2 parents c0cc9a9 + e830fb2 commit 5c8c442
Show file tree
Hide file tree
Showing 13 changed files with 172 additions and 34 deletions.
8 changes: 5 additions & 3 deletions changedetectionio/blueprint/tags/templates/edit-tag.html
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,13 @@
{{ render_field(form.subtractive_selectors, rows=5, placeholder="header
footer
nav
.stockticker") }}
.stockticker
//*[contains(text(), 'Advertisement')]") }}
<span class="pure-form-message-inline">
<ul>
<li> Remove HTML element(s) by CSS selector before text conversion. </li>
<li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
<li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
<li> Don't paste HTML here, use only CSS and XPath selectors </li>
<li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
</ul>
</span>
</fieldset>
Expand Down
6 changes: 4 additions & 2 deletions changedetectionio/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ class processor_text_json_diff_form(commonSettingsForm):

include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='')

subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)])

extract_text = StringListField('Extract text', [ValidateListRegex()])

Expand All @@ -481,7 +481,9 @@ class processor_text_json_diff_form(commonSettingsForm):
method = SelectField('Request method', choices=valid_method, default=default_method)
ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
check_unique_lines = BooleanField('Only trigger when unique lines appear in all history', default=False)
remove_duplicate_lines = BooleanField('Remove duplicate lines of text', default=False)
sort_text_alphabetically = BooleanField('Sort text alphabetically', default=False)
trim_text_whitespace = BooleanField('Trim whitespace before and after text', default=False)

filter_text_added = BooleanField('Added lines', default=True)
filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
Expand Down Expand Up @@ -576,7 +578,7 @@ class globalSettingsApplicationForm(commonSettingsForm):
empty_pages_are_a_change = BooleanField('Treat empty pages as a change?', default=False)
fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)])
ignore_whitespace = BooleanField('Ignore whitespace')
password = SaltyPasswordField()
pager_size = IntegerField('Pager size',
Expand Down
22 changes: 19 additions & 3 deletions changedetectionio/html_tools.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from lxml import etree
import json
import re

Expand Down Expand Up @@ -57,11 +58,26 @@ def subtractive_css_selector(css_selector, html_content):
item.decompose()
return str(soup)

def subtractive_xpath_selector(xpath_selector, html_content):
    """Remove every element matched by an XPath expression and return the HTML.

    Args:
        xpath_selector: XPath expression selecting the elements to drop.
        html_content: HTML markup to parse and filter.

    Returns:
        The filtered document serialized back to an HTML string.
    """
    html_tree = etree.HTML(html_content)
    if html_tree is None:
        # NOTE(review): some lxml versions appear to return None for empty
        # input instead of raising - confirm; nothing to filter either way.
        return html_content

    matches = html_tree.xpath(xpath_selector)
    if not isinstance(matches, list):
        # Expressions such as count(...) or string(...) evaluate to a scalar,
        # which selects nothing that could be removed.
        matches = []

    for element in matches:
        # Node-set results may also contain text or attribute values; only
        # element nodes can be detached from the tree.
        if not etree.iselement(element):
            continue

        parent = element.getparent()
        if parent is None:
            # The root element has no parent and cannot be removed.
            continue

        # lxml keeps the text that follows an element in its .tail and
        # discards it together with the element. That text belongs to the
        # parent, so re-attach it before detaching the node.
        tail = element.tail
        if tail:
            previous = element.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + tail
            else:
                parent.text = (parent.text or '') + tail

        parent.remove(element)

    return etree.tostring(html_tree, method="html").decode("utf-8")

def element_removal(selectors: List[str], html_content):
    """Remove elements that match a list of CSS or XPath selectors.

    Selectors are applied one after another, each to the output of the
    previous one. A selector is treated as XPath when it starts with
    ``xpath:``, ``xpath1:`` or ``//``; anything else is treated as CSS.

    Args:
        selectors: CSS or XPath selectors, one per entry.
        html_content: HTML markup to filter.

    Returns:
        The HTML with all matching elements removed. The input is returned
        unchanged when there is no usable selector.
    """
    modified_html = html_content
    for selector in selectors:
        # Selectors come from a multi-line text field, so tolerate padding
        # and blank lines instead of handing them to the selector engines.
        selector = selector.strip()
        if not selector:
            continue

        if selector.startswith(('xpath:', 'xpath1:', '//')):
            # Strip the optional scheme prefix; a bare '//' expression is
            # already valid XPath and is passed through as-is.
            xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
            modified_html = subtractive_xpath_selector(xpath_selector, modified_html)
        else:
            modified_html = subtractive_css_selector(selector, modified_html)
    return modified_html

def elementpath_tostring(obj):
"""
Expand Down
2 changes: 2 additions & 0 deletions changedetectionio/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def __init__(self, *arg, **kw):
'time_between_check_use_default': True,
'title': None,
'track_ldjson_price_data': None,
'trim_text_whitespace': False,
'remove_duplicate_lines': False,
'trigger_text': [], # List of text or regex to wait for until a change is detected
'url': '',
'uuid': str(uuid.uuid4()),
Expand Down
18 changes: 14 additions & 4 deletions changedetectionio/processors/text_json_diff/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,19 @@ def run_changedetection(self, watch, skip_when_checksum_same=True):
is_rss=is_rss)) #1874 activate the <title workaround hack
stripped_text_from_html = future.result()

if watch.get('sort_text_alphabetically') and stripped_text_from_html:

if watch.get('trim_text_whitespace'):
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())

if watch.get('remove_duplicate_lines'):
stripped_text_from_html = '\n'.join(dict.fromkeys(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))

if watch.get('sort_text_alphabetically'):
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n')
stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() ))
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))


# Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
Expand Down Expand Up @@ -304,14 +312,16 @@ def run_changedetection(self, watch, skip_when_checksum_same=True):
for match in res:
regex_matched_output += [match] + [b'\n']

# Now we will only show what the regex matched
##########################################################
stripped_text_from_html = b''
text_content_before_ignored_filter = b''
if regex_matched_output:
# @todo some formatter for presentation?
stripped_text_from_html = b''.join(regex_matched_output)
text_content_before_ignored_filter = stripped_text_from_html



# Re #133 - if we should strip whitespaces from triggering the change detected comparison
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
Expand Down
22 changes: 18 additions & 4 deletions changedetectionio/static/styles/scss/parts/_browser-steps.scss
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,29 @@
}
}

#browser-steps-fieldlist {
height: 100%;
overflow-y: scroll;
}

/* Two-pane row layout: #browser-steps-ui stretches to fill the remaining
   width, while #browser-steps-fieldlist is sized by its content (capped at
   400px) and scrolls vertically. */
#browser-steps .flex-wrapper {
display: flex;
flex-flow: row;
height: 70vh;
font-size: 80%;
#browser-steps-ui {
flex-grow: 1; /* Allow it to grow and fill the available space */
flex-shrink: 1; /* Allow it to shrink if needed */
flex-basis: 0; /* Start with 0 base width so it stretches as much as possible */
background-color: #eee;
border-radius: 5px;

}

#browser-steps-fieldlist {
flex-grow: 0; /* Don't allow it to grow */
flex-shrink: 0; /* Don't allow it to shrink */
flex-basis: auto; /* Base width is determined by the content */
max-width: 400px; /* Set a max width to prevent overflow */
padding-left: 1rem;
overflow-y: scroll;
}
}

/* this is duplicate :( */
Expand Down
27 changes: 22 additions & 5 deletions changedetectionio/static/styles/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,31 @@
#browser_steps li > label {
display: none; }

#browser-steps-fieldlist {
height: 100%;
overflow-y: scroll; }

#browser-steps .flex-wrapper {
display: flex;
flex-flow: row;
height: 70vh; }
height: 70vh;
font-size: 80%; }
#browser-steps .flex-wrapper #browser-steps-ui {
flex-grow: 1;
/* Allow it to grow and fill the available space */
flex-shrink: 1;
/* Allow it to shrink if needed */
flex-basis: 0;
/* Start with 0 base width so it stretches as much as possible */
background-color: #eee;
border-radius: 5px; }
#browser-steps .flex-wrapper #browser-steps-fieldlist {
flex-grow: 0;
/* Don't allow it to grow */
flex-shrink: 0;
/* Don't allow it to shrink */
flex-basis: auto;
/* Base width is determined by the content */
max-width: 400px;
/* Set a max width to prevent overflow */
padding-left: 1rem;
overflow-y: scroll; }

/* this is duplicate :( */
#browsersteps-selector-wrapper {
Expand Down
2 changes: 1 addition & 1 deletion changedetectionio/templates/_common_fields.html
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<strong>Tip:</strong> Use <a target=_new href="https://github.com/caronc/apprise">AppRise Notification URLs</a> for notification to just about any service! <i><a target=_new href="https://github.com/dgtlmoon/changedetection.io/wiki/Notification-configuration-notes">Please read the notification services wiki here for important configuration notes</a></i>.<br>
</p>
<div data-target="#advanced-help-notifications" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</div>
<ul style="display: none" id="advanced-help-notifications">
<ul style="display: none" id="advanced-help-notifications">
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_discord">discord://</a></code> (or <code>https://discord.com/api/webhooks...</code>) only supports a maximum of <strong>2,000 characters</strong> of notification text, including the title.</li>
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> bots can't send messages to other bots, so you should specify chat ID of non-bot user.</li>
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> only supports very limited HTML and can fail when extra tags are sent, <a href="https://core.telegram.org/bots/api#html-style">read more here</a> (or use plaintext/markdown format)</li>
Expand Down
26 changes: 19 additions & 7 deletions changedetectionio/templates/edit.html
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@
<div id="loading-status-text" style="display: none;">Please wait, first browser step can take a little time to load..<div class="spinner"></div></div>
<div class="flex-wrapper" >

<div id="browser-steps-ui" class="noselect" style="width: 100%; background-color: #eee; border-radius: 5px;">
<div id="browser-steps-ui" class="noselect">

<div class="noselect" id="browsersteps-selector-wrapper" style="width: 100%">
<span class="loader" >
Expand All @@ -215,7 +215,7 @@ <h2 >Click here to Start</h2>
<canvas class="noselect" id="browsersteps-selector-canvas" style="max-width: 100%; width: 100%;"></canvas>
</div>
</div>
<div id="browser-steps-fieldlist" style="padding-left: 1em; width: 350px; font-size: 80%;" >
<div id="browser-steps-fieldlist" >
<span id="browser-seconds-remaining">Loading</span> <span style="font-size: 80%;"> (<a target=_new href="https://github.com/dgtlmoon/changedetection.io/pull/478/files#diff-1a79d924d1840c485238e66772391268a89c95b781d69091384cf1ea1ac146c9R4">?</a>) </span>
{{ render_field(form.browser_steps) }}
</div>
Expand Down Expand Up @@ -313,12 +313,13 @@ <h2 >Click here to Start</h2>
{{ render_field(form.subtractive_selectors, rows=5, placeholder=has_tag_filters_extra+"header
footer
nav
.stockticker") }}
.stockticker
//*[contains(text(), 'Advertisement')]") }}
<span class="pure-form-message-inline">
<ul>
<li> Remove HTML element(s) by CSS selector before text conversion. </li>
<li> Don't paste HTML here, use only CSS selectors </li>
<li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
<li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
<li> Don't paste HTML here, use only CSS and XPath selectors </li>
<li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
</ul>
</span>
</fieldset>
Expand All @@ -333,11 +334,22 @@ <h3>Text filtering</h3>
<span class="pure-form-message-inline">So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
<span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
</fieldset>

<fieldset class="pure-control-group">
{{ render_checkbox_field(form.check_unique_lines) }}
<span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
</fieldset>
<fieldset class="pure-control-group">
{{ render_checkbox_field(form.remove_duplicate_lines) }}
<span class="pure-form-message-inline">Remove duplicate lines of text</span>
</fieldset>
<fieldset class="pure-control-group">
{{ render_checkbox_field(form.sort_text_alphabetically) }}
<span class="pure-form-message-inline">Helps reduce changes detected caused by sites shuffling lines around, combine with <i>check unique lines</i> below.</span>
</fieldset>
<fieldset class="pure-control-group">
{{ render_checkbox_field(form.trim_text_whitespace) }}
<span class="pure-form-message-inline">Remove any whitespace before and after each line of text</span>
</fieldset>
<fieldset class="pure-control-group">
{{ render_checkbox_field(form.check_unique_lines) }}
<span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
Expand Down
8 changes: 5 additions & 3 deletions changedetectionio/templates/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,13 @@
{{ render_field(form.application.form.global_subtractive_selectors, rows=5, placeholder="header
footer
nav
.stockticker") }}
.stockticker
//*[contains(text(), 'Advertisement')]") }}
<span class="pure-form-message-inline">
<ul>
<li> Remove HTML element(s) by CSS selector before text conversion. </li>
<li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
<li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
<li> Don't paste HTML here, use only CSS and XPath selectors </li>
<li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
</ul>
</span>
</fieldset>
Expand Down
14 changes: 13 additions & 1 deletion changedetectionio/tests/test_element_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,26 @@ def test_element_removal_output():
Some initial text<br>
<p>across multiple lines</p>
<div id="changetext">Some text that changes</div>
<div>Some text should be matched by xPath // selector</div>
<div>Some text should be matched by xPath selector</div>
<div>Some text should be matched by xPath1 selector</div>
</body>
<footer>
<p>Footer</p>
</footer>
</html>
"""
html_blob = element_removal(
["header", "footer", "nav", "#changetext"], html_content=content
[
"header",
"footer",
"nav",
"#changetext",
"//*[contains(text(), 'xPath // selector')]",
"xpath://*[contains(text(), 'xPath selector')]",
"xpath1://*[contains(text(), 'xPath1 selector')]"
],
html_content=content
)
text = get_text(html_blob)
assert (
Expand Down
49 changes: 49 additions & 0 deletions changedetectionio/tests/test_unique_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ def set_original_ignore_response():
<p>Some initial text</p>
<p>Which is across multiple lines</p>
<p>So let's see what happens.</p>
<p>&nbsp; So let's see what happens. <br> </p>
<p>A - sortable line</p>
</body>
</html>
"""
Expand Down Expand Up @@ -164,5 +166,52 @@ def test_sort_lines_functionality(client, live_server, measure_memory_usage):
assert res.data.find(b'A uppercase') < res.data.find(b'Z last')
assert res.data.find(b'Some initial text') < res.data.find(b'Which is across multiple lines')

res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data


def test_extra_filters(client, live_server, measure_memory_usage):
    """Duplicate-line removal and whitespace trimming work with sorting left off."""
    #live_server_setup(live_server)

    set_original_ignore_response()

    # Import the test endpoint as a new watch
    watched_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": watched_url},
        follow_redirects=True
    )
    assert b"1 Imported" in import_response.data
    wait_for_all_checks(client)

    # Edit the watch: enable duplicate removal and trimming only
    watch_settings = {
        "remove_duplicate_lines": "y",
        "trim_text_whitespace": "y",
        "sort_text_alphabetically": "",  # leave this OFF for testing
        "url": watched_url,
        "fetch_backend": "html_requests",
    }
    edit_response = client.post(
        url_for("edit_page", uuid="first"),
        data=watch_settings,
        follow_redirects=True
    )
    assert b"Updated watch." in edit_response.data
    # Give the thread time to pick it up
    wait_for_all_checks(client)

    # Trigger a check and wait for it to complete
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    preview_body = client.get(url_for("preview_page", uuid="first")).data

    # The repeated (whitespace-padded) line must collapse into a single one
    assert preview_body.count(b"see what happens.") == 1

    # Still unsorted: 'A - sortable line' stays after the earlier lines
    assert preview_body.find(b'A - sortable line') > preview_body.find(b'Which is across multiple lines')

    delete_response = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in delete_response.data
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dnspython==2.6.1 # related to eventlet fixes
# jq not available on Windows so must be installed manually

# Notification library
apprise~=1.8.1
apprise==1.9.0

# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
# and 2.0.0 https://github.com/dgtlmoon/changedetection.io/issues/2241 not yet compatible
Expand Down

0 comments on commit 5c8c442

Please sign in to comment.