Skip to content

Commit

Permalink
Initial commit - HN spider
Browse files Browse the repository at this point in the history
  • Loading branch information
rdowinton committed Apr 10, 2015
0 parents commit e3b7ee1
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
Empty file added README.rst
Empty file.
Empty file.
9 changes: 9 additions & 0 deletions blog/hn_scraper/hn_scraper/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
import scrapy


class HnArticleItem(scrapy.Item):
    """Container for one Hacker News story scraped by the HackerNews spider."""
    id = scrapy.Field()  # HN item id (spider parses it from an 'item?id=...' href)
    url = scrapy.Field()  # link target of the story title
    title = scrapy.Field()  # story title text
    author = scrapy.Field()  # submitter's username (from a 'user?id=...' href)
11 changes: 11 additions & 0 deletions blog/hn_scraper/hn_scraper/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class HnScraperPipeline(object):
    """Default no-op pipeline from ``scrapy startproject``.

    Inactive unless registered in the ITEM_PIPELINES setting (see the
    file header comment above).
    """

    def process_item(self, item, spider):
        # Pass every item through unchanged.
        return item
17 changes: 17 additions & 0 deletions blog/hn_scraper/hn_scraper/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Scrapy settings for hn_scraper project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

# Name used by Scrapy for this bot (appears in logs and the default user agent).
BOT_NAME = 'hn_scraper'

# Where Scrapy looks for existing spiders / creates new ones ('genspider').
SPIDER_MODULES = ['hn_scraper.spiders']
NEWSPIDER_MODULE = 'hn_scraper.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'hn_scraper (+http://www.yourdomain.com)'
47 changes: 47 additions & 0 deletions blog/hn_scraper/hn_scraper/spiders/HackerNews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from hn_scraper.items import HnArticleItem


class HackernewsSpider(CrawlSpider):
    """Crawl Hacker News listing pages and yield one HnArticleItem per story.

    Starts at the front page and follows the "More" pagination link
    (per ``rules``); every listing page is handed to ``parse_item``.
    """
    name = "HackerNews"
    allowed_domains = ["news.ycombinator.com"]
    start_urls = ('https://news.ycombinator.com/', )
    rules = (Rule(SgmlLinkExtractor(allow=('news', ),
                  restrict_xpaths=('//a[text()="More"]', )),
                  callback='parse_item',
                  follow=True), )

    def extract_one(self, selector, xpath, default=None):
        """Return the first string *xpath* extracts from *selector*, or *default*."""
        extracted = selector.xpath(xpath).extract()
        if extracted:
            return extracted[0]
        return default

    def _id_suffix(self, href):
        """Return the text after ``id=`` in *href*, or '' when absent.

        Replaces the original fixed ``href[8:]`` slice, which assumed hrefs
        shaped exactly like ``item?id=123`` / ``user?id=name`` and returned
        garbage (or made int() raise) for anything else.
        """
        return href.partition('id=')[2]

    def parse_item(self, response):
        """Parse one listing page; yield an HnArticleItem per story row."""
        selector = Selector(response)

        # Story rows are the 3-cell <tr>s inside the main listing table.
        rows = selector.xpath('//table[@id="hnmain"]//td[count(table) = 1]' \
                '//table[count(tr) > 1]//tr[count(td) = 3]')
        for row in rows:
            item = HnArticleItem()

            article = row.xpath('td[@class="title" and count(a) = 1]//a')
            item['url'] = self.extract_one(article, './@href', '')
            item['title'] = self.extract_one(article, './text()', '')

            # The sibling row under the title carries the author/comments links.
            subtext = row.xpath(
                './following-sibling::tr[1]//td[@class="subtext" and count(a) = 3]')
            if subtext:
                author_href = self.extract_one(subtext, './/a[1]/@href', '')
                item_href = self.extract_one(subtext, './/a[2]/@href', '')
                item['author'] = self._id_suffix(author_href)
                # Guard the conversion: the original int(item_id[8:]) raised
                # ValueError (killing the crawl) whenever the href was missing
                # or malformed, since extract_one falls back to ''.
                item_id = self._id_suffix(item_href)
                if item_id.isdigit():
                    item['id'] = int(item_id)

            yield item
4 changes: 4 additions & 0 deletions blog/hn_scraper/hn_scraper/spiders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
11 changes: 11 additions & 0 deletions blog/hn_scraper/scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = hn_scraper.settings

[deploy]
#url = http://localhost:6800/
project = hn_scraper

0 comments on commit e3b7ee1

Please sign in to comment.