Skip to content

Commit

Permalink
Adding beautifulsoup support.
Browse files Browse the repository at this point in the history
  • Loading branch information
Iury O. G. Figueiredo committed Jul 22, 2017
1 parent 434fab5 commit 699264b
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 5 deletions.
36 changes: 36 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ construct json-like structures for the data thats extracted from the pages.

- **Support for LXML**

- **Support for BeautifulSoup4**

- **Non-blocking I/O**

### Basic example
Expand Down Expand Up @@ -161,6 +163,39 @@ The structure would look like:
[(tag_name, {'quote': 'The quote text.', 'author': 'The author description from the about link'}), ...]
~~~

This other example uses BeautifulSoup4 to extract only the quotes. It also follows pagination.

~~~python
from sukhoi import MinerBS4, core

class QuoteMiner(MinerBS4):
    """
    Collect the text of every quote on a page and follow pagination.
    """

    def run(self, dom):
        """
        Store the quotes found in dom, then move on to the next page if any.

        dom is the BeautifulSoup object built for the fetched page.
        """
        elems = dom.find_all('div', {'class': 'quote'})
        self.extend(self.extract_quote(elem) for elem in elems)

        # The attrs argument has to be a dict. A set such as
        # {'class', 'next'} is taken by bs4 as a list of acceptable class
        # names, which would match class="class" as well.
        elem = dom.find('li', {'class': 'next'})
        if elem is not None:
            self.next(elem.find('a').get('href'))

    def extract_quote(self, elem):
        """
        Return the text of the span holding the quote inside elem.
        """
        quote = elem.find('span', {'class': 'text'})
        return quote.text

if __name__ == '__main__':
    URL = 'http://quotes.toscrape.com/'
    # The miner is created with the start URL before the main loop runs.
    quotes = QuoteMiner(URL)
    core.gear.mainloop()

    # Call form of print: valid on Python 3 and prints the same single
    # value on Python 2.
    print(quotes)

~~~

The structure would be:

~~~
[quote0, quote1, ...]
~~~


# Install

~~~
Expand All @@ -178,3 +213,4 @@ an issue about a donation :)




33 changes: 33 additions & 0 deletions demo/bs4_quotes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
This example extracts just the quotes; you end up with a structure like:
[quote0, quote1, ...]
Note: It uses beautifulsoup4 :)
"""

from sukhoi import MinerBS4, core

class QuoteMiner(MinerBS4):
    """
    Collect the text of every quote on a page and follow pagination.
    """

    def run(self, dom):
        """
        Store the quotes found in dom, then move on to the next page if any.

        dom is the BeautifulSoup object built for the fetched page.
        """
        elems = dom.find_all('div', {'class': 'quote'})
        self.extend(self.extract_quote(elem) for elem in elems)

        # The attrs argument has to be a dict. A set such as
        # {'class', 'next'} is taken by bs4 as a list of acceptable class
        # names, which would match class="class" as well.
        elem = dom.find('li', {'class': 'next'})
        if elem is not None:
            self.next(elem.find('a').get('href'))

    def extract_quote(self, elem):
        """
        Return the text of the span holding the quote inside elem.
        """
        quote = elem.find('span', {'class': 'text'})
        return quote.text

if __name__ == '__main__':
    URL = 'http://quotes.toscrape.com/'
    # The miner is created with the start URL before the main loop runs.
    quotes = QuoteMiner(URL)
    core.gear.mainloop()

    # Call form of print: valid on Python 3 and prints the same single
    # value on Python 2.
    print(quotes)






7 changes: 5 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
untwisted==1.3.2
websnake==1.1.0
untwisted==1.4.1
websnake==1.3.0
ehp==1.1.0
lxml
bs4





Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from distutils.core import setup
setup(name="sukhoi",
version="0.0.5",
version="0.0.6",
py_modules=["sukhoi"],
author="Iury O. G. Figueiredo",
author_email="ioliveira@id.uff.br",
Expand Down Expand Up @@ -39,5 +39,6 @@






19 changes: 17 additions & 2 deletions sukhoi.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from websnake import ResponseHandle, get, post
from ehp import Html as EhpHtml
import lxml.html as LxmlHtml

from websnake import ResponseHandle, get, post
from bs4 import BeautifulSoup
from untwisted.iostd import LOST
from untwisted.core import die
from untwisted.task import Task, DONE
Expand Down Expand Up @@ -134,18 +134,33 @@ def run(self, dom):
pass

class MinerEHP(Miner):
    """
    Miner whose dom structure is built with EHP.
    """

    # Parser object defined on the class, so every instance uses the same one.
    html = EhpHtml()

    def build_dom(self, data):
        # Feed the data to the EHP parser and hand the result to run().
        self.run(self.html.feed(data))

class MinerLXML(Miner):
    """
    Miner whose dom structure is built with lxml.html.
    """

    def build_dom(self, data):
        # Parse the data with lxml and hand the result to run().
        self.run(LxmlHtml.fromstring(data))

class MinerBS4(Miner):
    """
    Miner whose dom is a BeautifulSoup object built with the lxml parser.
    """

    def build_dom(self, data):
        # Build the soup with the 'lxml' parser and hand it to run().
        self.run(BeautifulSoup(data, 'lxml'))



0 comments on commit 699264b

Please sign in to comment.