diff --git a/blog/hn_scraper/hn_scraper/frontier_settings.py b/blog/hn_scraper/hn_scraper/frontera_settings.py
similarity index 100%
rename from blog/hn_scraper/hn_scraper/frontier_settings.py
rename to blog/hn_scraper/hn_scraper/frontera_settings.py
diff --git a/blog/hn_scraper/hn_scraper/settings.py b/blog/hn_scraper/hn_scraper/settings.py
index ca5e4ea..3161c64 100644
--- a/blog/hn_scraper/hn_scraper/settings.py
+++ b/blog/hn_scraper/hn_scraper/settings.py
@@ -18,14 +18,15 @@
 #--------------------------------------------------------------------------
 SPIDER_MIDDLEWARES = {}
 DOWNLOADER_MIDDLEWARES = {}
-SPIDER_MIDDLEWARES.update(
-    {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999},
-)
-DOWNLOADER_MIDDLEWARES.update(
-    {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999}
-)
+SPIDER_MIDDLEWARES.update({
+    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999
+}, )
+DOWNLOADER_MIDDLEWARES.update({
+    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware':
+        999
+})
 SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'
-FRONTERA_SETTINGS = 'hn_scraper.frontier_settings'
+FRONTERA_SETTINGS = 'hn_scraper.frontera_settings'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'hn_scraper (+http://www.yourdomain.com)'
diff --git a/blog/hn_scraper/hn_scraper/spiders/HackerNews.py b/blog/hn_scraper/hn_scraper/spiders/HackerNews.py
index 4062393..46534aa 100644
--- a/blog/hn_scraper/hn_scraper/spiders/HackerNews.py
+++ b/blog/hn_scraper/hn_scraper/spiders/HackerNews.py
@@ -13,8 +13,9 @@ class HackernewsSpider(Spider):
     allowed_domains = ["news.ycombinator.com"]
     start_urls = ('https://news.ycombinator.com/', )
 
-    link_extractor = SgmlLinkExtractor(allow=('news', ),
-                                       restrict_xpaths=('//a[text()="More"]', ))
+    link_extractor = SgmlLinkExtractor(
+        allow=('news', ),
+        restrict_xpaths=('//a[text()="More"]', ))
 
     def extract_one(self, selector, xpath, default=None):
         extracted = selector.xpath(xpath).extract()
@@ -31,7 +32,6 @@ def parse(self, response):
         for item in self.parse_item(response):
             yield item
 
-
     def parse_item(self, response):
         selector = Selector(response)