Adding beautifulsoup support.

untwisted · Jul 22, 2017 · 699264b · 699264b
1 parent 434fab5
commit 699264b
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -22,6 +22,8 @@ construct json-like structures for the data thats extracted from the pages.
 
 - **Support for LXML**
 
+- **Support for BeautifulSoup4**
+
 - **Non-blocking I/O**
 
 ### Basic example
@@ -161,6 +163,39 @@ The structure would look like:
 [(tag_name, {'quote': 'The quote text.', 'author': "The author description from the about link'}), ...]
 ~~~
 
+This second example uses BeautifulSoup4 to extract only the quotes. It also follows pagination.
+
+~~~python
+from sukhoi import MinerBS4, core
+
+class QuoteMiner(MinerBS4):  # collects the text of every quote on each page
+    def run(self, dom):  # dom: the BeautifulSoup tree built by MinerBS4
+        elems = dom.find_all('div', {'class':'quote'})
+        self.extend(map(self.extract_quote, elems))
+
+        elem = dom.find('li', {'class': 'next'})  # attrs must be a dict, not a set
+        if elem: self.next(elem.find('a').get('href'))  # follow the 'next' page link, if any
+
+    def extract_quote(self, elem):  # elem: one of the div.quote elements found in run()
+        quote = elem.find('span', {'class': 'text'})
+        return quote.text
+
+if __name__ == '__main__':
+    URL = 'http://quotes.toscrape.com/'
+    quotes = QuoteMiner(URL)  # miner starting at the first page
+    core.gear.mainloop()  # run the event loop; presumably returns once mining is done -- TODO confirm
+
+    print quotes  # Python 2 print statement; expected shape is [quote0, quote1, ...]
+
+~~~
+
+The structure would be:
+
+~~~
+[quote0, quote1, ...]
+~~~
+
+
 # Install
 
 ~~~
@@ -178,3 +213,4 @@ an issue about a donation :)
 
 
 
+
diff --git a/demo/bs4_quotes.py b/demo/bs4_quotes.py
@@ -0,0 +1,33 @@
+"""
+This example extracts just the quotes; you end up with a structure like:
+    [quote0, quote1, ...]
+
+Note: It uses beautifulsoup4 :)
+"""
+
+from sukhoi import MinerBS4, core
+
+class QuoteMiner(MinerBS4):  # collects the text of every quote on each page
+    def run(self, dom):  # dom: the BeautifulSoup tree built by MinerBS4
+        elems = dom.find_all('div', {'class':'quote'})
+        self.extend(map(self.extract_quote, elems))
+
+        elem = dom.find('li', {'class': 'next'})  # attrs must be a dict, not a set
+        if elem: self.next(elem.find('a').get('href'))  # follow the 'next' page link, if any
+
+    def extract_quote(self, elem):  # elem: one of the div.quote elements found in run()
+        quote = elem.find('span', {'class': 'text'})
+        return quote.text
+
+if __name__ == '__main__':
+    URL = 'http://quotes.toscrape.com/'
+    quotes = QuoteMiner(URL)  # miner starting at the first page
+    core.gear.mainloop()  # run the event loop; presumably returns once mining is done -- TODO confirm
+
+    print quotes  # Python 2 print statement; expected shape is [quote0, quote1, ...]
+
+
+
+
+
+
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,10 @@
-untwisted==1.3.2
-websnake==1.1.0
+untwisted==1.4.1
+websnake==1.3.0
 ehp==1.1.0
 lxml
+bs4
+
+
 
 
 

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 from distutils.core import setup
 setup(name="sukhoi",
-      version="0.0.5",
+      version="0.0.6",
       py_modules=["sukhoi"],
       author="Iury O. G. Figueiredo",
       author_email="ioliveira@id.uff.br",
@@ -39,5 +39,6 @@
 
 
 
+
 
 
diff --git a/sukhoi.py b/sukhoi.py
@@ -1,7 +1,7 @@
+from websnake import ResponseHandle, get, post
 from ehp import Html as EhpHtml
 import lxml.html as LxmlHtml
-
-from websnake import ResponseHandle, get, post
+from bs4 import BeautifulSoup
 from untwisted.iostd import LOST
 from untwisted.core import die
 from untwisted.task import Task, DONE
@@ -134,18 +134,33 @@ def run(self, dom):
         pass
 
 class MinerEHP(Miner):
+    """
+    Build the dom with the class-level EHP Html parser and hand it to run().
+    """
+
     html = EhpHtml()
 
     def build_dom(self, data):
         dom  = self.html.feed(data)
         self.run(dom)
 
 class MinerLXML(Miner):
+    """
+    Build the dom with lxml.html.fromstring() and hand it to run().
+    """
+
     def build_dom(self, data):
         dom = LxmlHtml.fromstring(data)
         self.run(dom)
 
+class MinerBS4(Miner):
+    """
+    Build the dom with BeautifulSoup using the 'lxml' parser and hand it to run().
+    """
 
+    def build_dom(self, data):
+        dom = BeautifulSoup(data, 'lxml')
+        self.run(dom)