From e3b7ee133eecab5afc40da6e40e4ec59969f3d86 Mon Sep 17 00:00:00 2001
From: Richard Dowinton
Date: Fri, 10 Apr 2015 20:57:07 +0100
Subject: [PATCH] Initial commit - HN spider

---
 .gitignore                                    |  1 +
 README.rst                                    |  0
 blog/hn_scraper/hn_scraper/__init__.py        |  0
 blog/hn_scraper/hn_scraper/items.py           |  9 ++++
 blog/hn_scraper/hn_scraper/pipelines.py       | 11 +++++
 blog/hn_scraper/hn_scraper/settings.py        | 17 +++++++
 .../hn_scraper/spiders/HackerNews.py          | 47 +++++++++++++++++++
 .../hn_scraper/hn_scraper/spiders/__init__.py |  4 ++
 blog/hn_scraper/scrapy.cfg                    | 11 +++++
 9 files changed, 100 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.rst
 create mode 100644 blog/hn_scraper/hn_scraper/__init__.py
 create mode 100644 blog/hn_scraper/hn_scraper/items.py
 create mode 100644 blog/hn_scraper/hn_scraper/pipelines.py
 create mode 100644 blog/hn_scraper/hn_scraper/settings.py
 create mode 100644 blog/hn_scraper/hn_scraper/spiders/HackerNews.py
 create mode 100644 blog/hn_scraper/hn_scraper/spiders/__init__.py
 create mode 100644 blog/hn_scraper/scrapy.cfg

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..e69de29
diff --git a/blog/hn_scraper/hn_scraper/__init__.py b/blog/hn_scraper/hn_scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/blog/hn_scraper/hn_scraper/items.py b/blog/hn_scraper/hn_scraper/items.py
new file mode 100644
index 0000000..25709cc
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/items.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+import scrapy
+
+
+class HnArticleItem(scrapy.Item):
+    id = scrapy.Field()
+    url = scrapy.Field()
+    title = scrapy.Field()
+    author = scrapy.Field()
diff --git a/blog/hn_scraper/hn_scraper/pipelines.py b/blog/hn_scraper/hn_scraper/pipelines.py
new file mode 100644
index 0000000..245ac75
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class HnScraperPipeline(object):
+    def process_item(self, item, spider):
+        return item
diff --git a/blog/hn_scraper/hn_scraper/settings.py b/blog/hn_scraper/hn_scraper/settings.py
new file mode 100644
index 0000000..0bb0861
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/settings.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for hn_scraper project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+BOT_NAME = 'hn_scraper'
+
+SPIDER_MODULES = ['hn_scraper.spiders']
+NEWSPIDER_MODULE = 'hn_scraper.spiders'
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'hn_scraper (+http://www.yourdomain.com)'
diff --git a/blog/hn_scraper/hn_scraper/spiders/HackerNews.py b/blog/hn_scraper/hn_scraper/spiders/HackerNews.py
new file mode 100644
index 0000000..4284a42
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/spiders/HackerNews.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.selector import Selector
+
+from hn_scraper.items import HnArticleItem
+
+
+class HackernewsSpider(CrawlSpider):
+    name = "HackerNews"
+    allowed_domains = ["news.ycombinator.com"]
+    start_urls = ('https://news.ycombinator.com/', )
+    rules = (Rule(SgmlLinkExtractor(allow=('news', ),
+                                    restrict_xpaths=('//a[text()="More"]', )),
+                  callback='parse_item',
+                  follow=True), )
+
+    def extract_one(self, selector, xpath, default=None):
+        extracted = selector.xpath(xpath).extract()
+        if extracted:
+            return extracted[0]
+        return default
+
+    def parse_item(self, response):
+        selector = Selector(response)
+
+        rows = selector.xpath('//table[@id="hnmain"]//td[count(table) = 1]' \
+                              '//table[count(tr) > 1]//tr[count(td) = 3]')
+        for row in rows:
+            item = HnArticleItem()
+
+            article = row.xpath('td[@class="title" and count(a) = 1]//a')
+            article_url = self.extract_one(article, './@href', '')
+            article_title = self.extract_one(article, './text()', '')
+            item['url'] = article_url
+            item['title'] = article_title
+
+            subtext = row.xpath(
+                './following-sibling::tr[1]//td[@class="subtext" and count(a) = 3]')
+            if subtext:
+                item_author = self.extract_one(subtext, './/a[1]/@href', '')
+                item_id = self.extract_one(subtext, './/a[2]/@href', '')
+                item['author'] = item_author[8:]
+                item['id'] = int(item_id[8:])
+
+            yield item
diff --git a/blog/hn_scraper/hn_scraper/spiders/__init__.py b/blog/hn_scraper/hn_scraper/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/blog/hn_scraper/hn_scraper/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/blog/hn_scraper/scrapy.cfg b/blog/hn_scraper/scrapy.cfg
new file mode 100644
index 0000000..05d9562
--- /dev/null
+++ b/blog/hn_scraper/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = hn_scraper.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = hn_scraper
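
-- 
Usage note (not part of the patch): once applied, the spider can be run
from the blog/hn_scraper directory (where scrapy.cfg lives) with the
standard "scrapy crawl HackerNews -o articles.json" command. The sketch
below drives it programmatically instead; it is an illustration only,
assuming a Scrapy version that provides the CrawlerProcess API, and the
run_spider.py name is hypothetical:

    # run_spider.py -- illustrative helper, not part of the patch above.
    # Assumes it runs from blog/hn_scraper so that the hn_scraper package
    # and its project settings resolve.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from hn_scraper.spiders.HackerNews import HackernewsSpider

    # CrawlerProcess loads the project settings (BOT_NAME, SPIDER_MODULES,
    # ...) and manages the Twisted reactor internally.
    process = CrawlerProcess(get_project_settings())
    process.crawl(HackernewsSpider)
    process.start()  # blocks until the crawl finishes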