Skip to content

Commit

Permalink
Restock/Price detection - Better catching of errors when parsing metadata documents for restock/price check (#2602)
Browse files Browse the repository at this point in the history
  • Loading branch information
dgtlmoon authored Sep 1, 2024
1 parent 4225900 commit 55fe2ab
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions changedetectionio/processors/restock_diff/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,16 @@ def get_itemprop_availability(html_content) -> Restock:
     import extruct
     logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")

-    value = {}
     now = time.time()
-    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.

+    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
     syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
+    try:
+        data = extruct.extract(html_content, syntaxes=syntaxes)
+    except Exception as e:
+        logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
+        return Restock()

-    data = extruct.extract(html_content, syntaxes=syntaxes)
     logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")

     # First phase, dead simple scanning of anything that looks useful
Expand Down

0 comments on commit 55fe2ab

Please sign in to comment.