Skip to content

Commit

Permalink
Initial commit - HN spider
Browse files Browse the repository at this point in the history
  • Loading branch information
rdowinton committed Apr 10, 2015
0 parents commit e3b7ee1
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
Empty file added README.rst
Empty file.
Empty file.
9 changes: 9 additions & 0 deletions blog/hn_scraper/hn_scraper/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
import scrapy


class HnArticleItem(scrapy.Item):
    """Container for one Hacker News story scraped by the HackerNews spider."""
    id = scrapy.Field()  # HN item id (spider parses it from an 'item?id=...' href)
    url = scrapy.Field()  # link target of the story title
    title = scrapy.Field()  # story title text
    author = scrapy.Field()  # submitter's username (from a 'user?id=...' href)
11 changes: 11 additions & 0 deletions blog/hn_scraper/hn_scraper/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class HnScraperPipeline(object):
    """Default no-op pipeline from ``scrapy startproject``.

    Inactive unless registered in the ITEM_PIPELINES setting (see the
    file header comment above).
    """

    def process_item(self, item, spider):
        # Pass every item through unchanged.
        return item
17 changes: 17 additions & 0 deletions blog/hn_scraper/hn_scraper/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Scrapy settings for hn_scraper project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

# Name used by Scrapy for this bot (appears in logs and the default user agent).
BOT_NAME = 'hn_scraper'

# Where Scrapy looks for existing spiders / creates new ones ('genspider').
SPIDER_MODULES = ['hn_scraper.spiders']
NEWSPIDER_MODULE = 'hn_scraper.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'hn_scraper (+http://www.yourdomain.com)'
47 changes: 47 additions & 0 deletions blog/hn_scraper/hn_scraper/spiders/HackerNews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from hn_scraper.items import HnArticleItem


class HackernewsSpider(CrawlSpider):
    """Crawl Hacker News listing pages and yield one HnArticleItem per story.

    Starts at the front page and follows the "More" pagination link
    (per ``rules``); every listing page is handed to ``parse_item``.
    """
    name = "HackerNews"
    allowed_domains = ["news.ycombinator.com"]
    start_urls = ('https://news.ycombinator.com/', )
    rules = (Rule(SgmlLinkExtractor(allow=('news', ),
                  restrict_xpaths=('//a[text()="More"]', )),
                  callback='parse_item',
                  follow=True), )

    def extract_one(self, selector, xpath, default=None):
        """Return the first string *xpath* extracts from *selector*, or *default*."""
        extracted = selector.xpath(xpath).extract()
        if extracted:
            return extracted[0]
        return default

    def _id_suffix(self, href):
        """Return the text after ``id=`` in *href*, or '' when absent.

        Replaces the original fixed ``href[8:]`` slice, which assumed hrefs
        shaped exactly like ``item?id=123`` / ``user?id=name`` and returned
        garbage (or made int() raise) for anything else.
        """
        return href.partition('id=')[2]

    def parse_item(self, response):
        """Parse one listing page; yield an HnArticleItem per story row."""
        selector = Selector(response)

        # Story rows are the 3-cell <tr>s inside the main listing table.
        rows = selector.xpath('//table[@id="hnmain"]//td[count(table) = 1]' \
                '//table[count(tr) > 1]//tr[count(td) = 3]')
        for row in rows:
            item = HnArticleItem()

            article = row.xpath('td[@class="title" and count(a) = 1]//a')
            item['url'] = self.extract_one(article, './@href', '')
            item['title'] = self.extract_one(article, './text()', '')

            # The sibling row under the title carries the author/comments links.
            subtext = row.xpath(
                './following-sibling::tr[1]//td[@class="subtext" and count(a) = 3]')
            if subtext:
                author_href = self.extract_one(subtext, './/a[1]/@href', '')
                item_href = self.extract_one(subtext, './/a[2]/@href', '')
                item['author'] = self._id_suffix(author_href)
                # Guard the conversion: the original int(item_id[8:]) raised
                # ValueError (killing the crawl) whenever the href was missing
                # or malformed, since extract_one falls back to ''.
                item_id = self._id_suffix(item_href)
                if item_id.isdigit():
                    item['id'] = int(item_id)

            yield item
4 changes: 4 additions & 0 deletions blog/hn_scraper/hn_scraper/spiders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
11 changes: 11 additions & 0 deletions blog/hn_scraper/scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = hn_scraper.settings

[deploy]
#url = http://localhost:6800/
project = hn_scraper

0 comments on commit e3b7ee1

Please sign in to comment.