Use lxml if available and parse head only. #10

Open · wants to merge 1 commit into master
39 changes: 28 additions & 11 deletions extraction/techniques.py
@@ -1,9 +1,26 @@
 "This file contains techniques for extracting data from HTML pages."
 import bs4

+LXML_AVAILABLE = False

-def init_bs(html):
-    return bs4.BeautifulSoup(html, features="html5lib")
+try:
+    import lxml
+    LXML_AVAILABLE = True
+except ImportError:
+    pass
+
+
+def init_bs(html, head_only=False):
+    features = "html5lib"
+    parse_only = None
+
+    if LXML_AVAILABLE:
+        features = "lxml"
+
+    if head_only:
+        parse_only = bs4.SoupStrainer("head")
+
+    return bs4.BeautifulSoup(html, features=features, parse_only=parse_only)


 class Technique(object):
@@ -14,7 +31,7 @@ def __init__(self, extractor=None, *args, **kwargs):
         """
         self.extractor = extractor
         super(Technique, self).__init__(*args, **kwargs)

     def extract(self, html):
         "Extract data from a string representing an HTML document."
         return {'titles': [],
@@ -48,7 +65,7 @@ class HeadTags(Technique):
     def extract(self, html):
         "Extract data from meta, link and title tags within the head tag."
         extracted = {}
-        soup = init_bs(html)
+        soup = init_bs(html, head_only=True)
         # extract data from title tag
         title_tag = soup.find('title')
         if title_tag:
@@ -114,7 +131,7 @@ class FacebookOpengraphTags(Technique):
     def extract(self, html):
         "Extract data from Facebook Opengraph tags."
         extracted = {}
-        soup = init_bs(html)
+        soup = init_bs(html, head_only=True)
         for meta_tag in soup.find_all('meta'):
             if self.key_attr in meta_tag.attrs and 'content' in meta_tag.attrs:
                 property = meta_tag[self.key_attr]
@@ -144,7 +161,7 @@ class HTML5SemanticTags(Technique):
     The HTML5 `article` tag, and also the `video` tag give us some useful
     hints for extracting page information for the sites which happen to
     utilize these tags.

     This technique will extract information from pages formed like::

         <html>
@@ -166,7 +183,7 @@ class HTML5SemanticTags(Technique):
     of cases where it hits, and otherwise expects `SemanticTags` to run sweep
     behind it for the lower quality, more abundant hits it discovers.
     """

     def extract(self, html):
         "Extract data from HTML5 semantic tags."
         titles = []
@@ -207,12 +224,12 @@ class SemanticTags(Technique):
                       ]
     # format is ("name of tag", "destination list", "name of attribute" store_first_n)
     extract_attr = [('img', 'images', 'src', 10)]

     def extract(self, html):
         "Extract data from usual semantic tags."
         extracted = {}
         soup = init_bs(html)

         for tag, dest, max_to_store in self.extract_string:
             for found in soup.find_all(tag)[:max_to_store] or []:
                 if dest not in extracted:
@@ -227,5 +244,5 @@ def extract(self, html):
                     extracted[dest].append(found[attribute])

         return extracted
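
For reviewers, a minimal sketch of what the patched helper is intended to do: prefer bs4's lxml tree builder when lxml can be imported, fall back to html5lib otherwise, and pass a SoupStrainer so the head-only techniques (HeadTags, FacebookOpengraphTags) can skip building the document body. The FEATURES constant, the demo HTML, and the __main__ block below are illustrative assumptions, not code from this diff. One caveat worth noting: BeautifulSoup's html5lib builder does not support parse_only, so the head-only restriction only takes effect when lxml is actually installed.

import bs4

# Availability check mirrors the PR: a successful `import lxml` is used as a
# proxy for whether bs4 can use the "lxml" tree builder. FEATURES is an
# illustrative name, not taken from the diff.
try:
    import lxml  # noqa: F401 -- imported only to test availability
    FEATURES = "lxml"
except ImportError:
    FEATURES = "html5lib"


def init_bs(html, head_only=False):
    "Build a BeautifulSoup tree, optionally restricted to the head element."
    # SoupStrainer keeps only matching tags during parsing, so meta/link/title
    # extraction avoids building the (usually much larger) body.
    parse_only = bs4.SoupStrainer("head") if head_only else None
    return bs4.BeautifulSoup(html, features=FEATURES, parse_only=parse_only)


if __name__ == "__main__":
    html = ("<html><head><title>Example</title>"
            "<meta name='description' content='demo'></head>"
            "<body><h1>Body heading</h1></body></html>")
    soup = init_bs(html, head_only=True)
    print(soup.find("title"))  # <title>Example</title>
    # With the lxml builder the body is never parsed, so this prints None;
    # with the html5lib fallback, parse_only is ignored and the <h1> is found.
    print(soup.find("h1"))

This also explains why SemanticTags keeps calling init_bs(html) without head_only in the diff above: it sweeps ordinary tags in the body (its extract_string and extract_attr lists), so restricting the parse to the head would discard everything it looks for.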