From e3b7ee133eecab5afc40da6e40e4ec59969f3d86 Mon Sep 17 00:00:00 2001
From: Richard Dowinton
Date: Fri, 10 Apr 2015 20:57:07 +0100
Subject: [PATCH] Initial commit - HN spider

---
 .gitignore                                    |  1 +
 README.rst                                    |  0
 blog/hn_scraper/hn_scraper/__init__.py        |  0
 blog/hn_scraper/hn_scraper/items.py           |  9 ++++
 blog/hn_scraper/hn_scraper/pipelines.py       | 11 +++++
 blog/hn_scraper/hn_scraper/settings.py        | 17 +++++++
 .../hn_scraper/spiders/HackerNews.py          | 47 +++++++++++++++++++
 .../hn_scraper/hn_scraper/spiders/__init__.py |  4 ++
 blog/hn_scraper/scrapy.cfg                    | 11 +++++
 9 files changed, 100 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.rst
 create mode 100644 blog/hn_scraper/hn_scraper/__init__.py
 create mode 100644 blog/hn_scraper/hn_scraper/items.py
 create mode 100644 blog/hn_scraper/hn_scraper/pipelines.py
 create mode 100644 blog/hn_scraper/hn_scraper/settings.py
 create mode 100644 blog/hn_scraper/hn_scraper/spiders/HackerNews.py
 create mode 100644 blog/hn_scraper/hn_scraper/spiders/__init__.py
 create mode 100644 blog/hn_scraper/scrapy.cfg

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..e69de29
diff --git a/blog/hn_scraper/hn_scraper/__init__.py b/blog/hn_scraper/hn_scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/blog/hn_scraper/hn_scraper/items.py b/blog/hn_scraper/hn_scraper/items.py
new file mode 100644
index 0000000..25709cc
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/items.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+import scrapy
+
+
+class HnArticleItem(scrapy.Item):
+    id = scrapy.Field()
+    url = scrapy.Field()
+    title = scrapy.Field()
+    author = scrapy.Field()
diff --git a/blog/hn_scraper/hn_scraper/pipelines.py b/blog/hn_scraper/hn_scraper/pipelines.py
new file mode 100644
index 0000000..245ac75
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class HnScraperPipeline(object):
+    def process_item(self, item, spider):
+        return item
diff --git a/blog/hn_scraper/hn_scraper/settings.py b/blog/hn_scraper/hn_scraper/settings.py
new file mode 100644
index 0000000..0bb0861
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/settings.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for hn_scraper project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+BOT_NAME = 'hn_scraper'
+
+SPIDER_MODULES = ['hn_scraper.spiders']
+NEWSPIDER_MODULE = 'hn_scraper.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'hn_scraper (+http://www.yourdomain.com)'
diff --git a/blog/hn_scraper/hn_scraper/spiders/HackerNews.py b/blog/hn_scraper/hn_scraper/spiders/HackerNews.py
new file mode 100644
index 0000000..4284a42
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/spiders/HackerNews.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.selector import Selector
+
+from hn_scraper.items import HnArticleItem
+
+
+class HackernewsSpider(CrawlSpider):
+    name = "HackerNews"
+    allowed_domains = ["news.ycombinator.com"]
+    start_urls = ('https://news.ycombinator.com/', )
+    rules = (Rule(SgmlLinkExtractor(allow=('news', ),
+                                    restrict_xpaths=('//a[text()="More"]', )),
+                  callback='parse_item',
+                  follow=True), )
+
+    def extract_one(self, selector, xpath, default=None):
+        extracted = selector.xpath(xpath).extract()
+        if extracted:
+            return extracted[0]
+        return default
+
+    def parse_item(self, response):
+        selector = Selector(response)
+
+        rows = selector.xpath('//table[@id="hnmain"]//td[count(table) = 1]' \
+                              '//table[count(tr) > 1]//tr[count(td) = 3]')
+        for row in rows:
+            item = HnArticleItem()
+
+            article = row.xpath('td[@class="title" and count(a) = 1]//a')
+            article_url = self.extract_one(article, './@href', '')
+            article_title = self.extract_one(article, './text()', '')
+            item['url'] = article_url
+            item['title'] = article_title
+
+            subtext = row.xpath(
+                './following-sibling::tr[1]//td[@class="subtext" and count(a) = 3]')
+            if subtext:
+                item_author = self.extract_one(subtext, './/a[1]/@href', '')
+                item_id = self.extract_one(subtext, './/a[2]/@href', '')
+                item['author'] = item_author[8:]
+                item['id'] = int(item_id[8:])
+
+            yield item
diff --git a/blog/hn_scraper/hn_scraper/spiders/__init__.py b/blog/hn_scraper/hn_scraper/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/blog/hn_scraper/scrapy.cfg b/blog/hn_scraper/scrapy.cfg
new file mode 100644
index 0000000..05d9562
--- /dev/null
+++ b/blog/hn_scraper/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = hn_scraper.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = hn_scraper
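
-- 
Usage note (not part of the patch): once applied, the spider can be run
from the blog/hn_scraper directory (where scrapy.cfg lives) with the
standard "scrapy crawl HackerNews -o articles.json" command. The sketch
below drives it programmatically instead; it is an illustration only,
assuming a Scrapy version that provides the CrawlerProcess API, and the
run_spider.py name is hypothetical:

    # run_spider.py -- illustrative helper, not part of the patch above.
    # Assumes it runs from blog/hn_scraper so that the hn_scraper package
    # and its project settings resolve.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from hn_scraper.spiders.HackerNews import HackernewsSpider

    # CrawlerProcess loads the project settings (BOT_NAME, SPIDER_MODULES,
    # ...) and manages the Twisted reactor internally.
    process = CrawlerProcess(get_project_settings())
    process.crawl(HackernewsSpider)
    process.start()  # blocks until the crawl finishes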