Merge pull request #1 from plafl/master
Added crawlfrontier SQLAlchemy backend
rdowinton committed Apr 16, 2015
2 parents e3b7ee1 + b2d22db commit 6cdb29b
Showing 3 changed files with 35 additions and 6 deletions.
5 changes: 5 additions & 0 deletions blog/hn_scraper/hn_scraper/frontier_settings.py
@@ -0,0 +1,5 @@
+BACKEND = 'frontera.contrib.backends.sqlalchemy.FIFO'
+SQLALCHEMYBACKEND_ENGINE = 'sqlite:///hn_frontier.db'
+MAX_REQUESTS = 2000
+MAX_NEXT_REQUESTS = 10
+DELAY_ON_EMPTY = 0.0
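A note on these new settings: BACKEND selects Frontera's SQLAlchemy-backed FIFO queue, SQLALCHEMYBACKEND_ENGINE is an ordinary SQLAlchemy connection URI (here a local SQLite file, hn_frontier.db), MAX_REQUESTS caps the total number of requests the frontier will hand out, MAX_NEXT_REQUESTS limits how many requests are returned per batch, and DELAY_ON_EMPTY disables the wait between backend polls when the queue is momentarily empty. Because the queue lives in a SQLite file, the crawl state can be inspected after a run; a minimal sketch, assuming the crawl has already created hn_frontier.db (the table names are whatever the backend creates, so list them rather than guessing):

    # Sketch: peek at the frontier database the SQLAlchemy backend writes.
    # Assumes hn_frontier.db exists in the working directory.
    from sqlalchemy import create_engine, inspect

    engine = create_engine('sqlite:///hn_frontier.db')
    print(inspect(engine).get_table_names())  # tables created by the backend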
14 changes: 14 additions & 0 deletions blog/hn_scraper/hn_scraper/settings.py
@@ -13,5 +13,19 @@
 SPIDER_MODULES = ['hn_scraper.spiders']
 NEWSPIDER_MODULE = 'hn_scraper.spiders'
 
+#--------------------------------------------------------------------------
+# Frontier Settings
+#--------------------------------------------------------------------------
+SPIDER_MIDDLEWARES = {}
+DOWNLOADER_MIDDLEWARES = {}
+SPIDER_MIDDLEWARES.update(
+    {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999},
+)
+DOWNLOADER_MIDDLEWARES.update(
+    {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999}
+)
+SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'
+FRONTERA_SETTINGS = 'hn_scraper.frontier_settings'
+
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'hn_scraper (+http://www.yourdomain.com)'
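These lines wire Frontera into Scrapy: the two scheduler middlewares intercept requests and responses, SCHEDULER replaces Scrapy's default in-memory scheduler with FronteraScheduler (so every request is queued through the frontier backend instead), and FRONTERA_SETTINGS points at the frontier_settings module added above. With this in place the spider is started as usual with `scrapy crawl HackerNews`. The empty-dict-then-update() pattern mirrors the Frontera documentation; a functionally equivalent, more compact form would be plain dict literals, sketched here:

    # Equivalent wiring as plain dict literals (a sketch; same effect as
    # the update() calls in the diff above):
    SPIDER_MIDDLEWARES = {
        'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999,
    }
    DOWNLOADER_MIDDLEWARES = {
        'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999,
    }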
22 changes: 16 additions & 6 deletions blog/hn_scraper/hn_scraper/spiders/HackerNews.py
@@ -1,27 +1,37 @@
 # -*- coding: utf-8 -*-
 import scrapy
-from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.http import Request
+from scrapy.spider import Spider
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.selector import Selector
 
 from hn_scraper.items import HnArticleItem
 
 
-class HackernewsSpider(CrawlSpider):
+class HackernewsSpider(Spider):
     name = "HackerNews"
     allowed_domains = ["news.ycombinator.com"]
     start_urls = ('https://news.ycombinator.com/', )
-    rules = (Rule(SgmlLinkExtractor(allow=('news', ),
-                                    restrict_xpaths=('//a[text()="More"]', )),
-                  callback='parse_item',
-                  follow=True), )
+
+    link_extractor = SgmlLinkExtractor(allow=('news', ),
+                                       restrict_xpaths=('//a[text()="More"]', ))
 
     def extract_one(self, selector, xpath, default=None):
         extracted = selector.xpath(xpath).extract()
         if extracted:
             return extracted[0]
         return default
 
+    def parse(self, response):
+        for link in self.link_extractor.extract_links(response):
+            request = Request(url=link.url)
+            request.meta.update(link_text=link.text)
+            yield request
+
+        for item in self.parse_item(response):
+            yield item
+
     def parse_item(self, response):
         selector = Selector(response)

(remainder of parse_item unchanged; collapsed in the diff view)
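The structural change here: the spider drops CrawlSpider and its rules machinery in favor of a plain Spider that follows the "More" pagination links explicitly. parse() extracts those links itself and yields Request objects, so every new request flows through the Frontera scheduler configured above rather than being scheduled internally by CrawlSpider; it then delegates item extraction for the current page to parse_item(). The body of parse_item is collapsed in this view; purely as an illustration of how extract_one() and HnArticleItem fit together (the XPaths below are hypothetical placeholders, not the ones in the real file), it has roughly this shape:

    # Hypothetical sketch only -- the actual parse_item body is collapsed
    # in the diff above, and these XPaths are illustrative placeholders.
    def parse_item(self, response):
        selector = Selector(response)
        item = HnArticleItem()
        # extract_one() returns the first XPath match, or the default
        item['title'] = self.extract_one(selector, '//td[@class="title"]/a/text()', '')
        item['url'] = self.extract_one(selector, '//td[@class="title"]/a/@href', '')
        yield item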
