Skip to content

Commit

Permalink
rough etymology scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
shbhrsaha committed May 6, 2014
1 parent eaa54f2 commit c6f2326
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 1 deletion.
Empty file.
10 changes: 10 additions & 0 deletions features/etymology/etym_scraper/items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class Entry(Item):
    """One scraped dictionary entry: a headword and its etymology text.

    Both fields are declared as plain Scrapy ``Field`` objects, so no
    type or serializer is enforced on the values stored in them.
    """

    # Headword of the entry.
    word = Field()
    # Etymology text associated with the headword.
    etym = Field()
8 changes: 8 additions & 0 deletions features/etymology/etym_scraper/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class EtymScraperPipeline(object):
    """Pass-through item pipeline that performs no processing."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        Args:
            item: The scraped item handed to the pipeline.
            spider: The spider that produced the item (unused).

        Returns:
            The very same ``item`` object that was passed in.
        """
        return item
15 changes: 15 additions & 0 deletions features/etymology/etym_scraper/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Scrapy settings for etym_scraper project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'etym_scraper'

# The single package that holds this project's spiders: Scrapy looks there
# for existing spiders and creates newly generated ones in it as well.
NEWSPIDER_MODULE = 'etym_scraper.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'etym_scraper (+http://www.yourdomain.com)'
4 changes: 4 additions & 0 deletions features/etymology/etym_scraper/spiders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
31 changes: 31 additions & 0 deletions features/etymology/etym_scraper/spiders/etym_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from etym_scraper.items import Entry

class EtymSpider(CrawlSpider):
    """Crawl etymonline.com index pages and collect word/etymology entries.

    Starting from the site's ``index.php`` page, every extracted link whose
    URL matches ``index\\.php`` is fetched and handed to ``parse_details``.
    """

    name = "etym_scraper"
    allowed_domains = ["etymonline.com"]
    start_urls = [
        "http://www.etymonline.com/index.php"
    ]

    # The allow pattern is a regular expression, so the dot is escaped, and
    # the trailing comma makes the argument a real one-element tuple.
    # NOTE(review): a Rule that has a callback does not follow links found on
    # the pages it handles unless follow=True is passed -- confirm that
    # crawling only the links present on the start page is intended.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'index\.php',)), callback='parse_details'),
    )

    def parse_details(self, response):
        """Build one ``Entry`` for every ``<dt>`` element in the response.

        Args:
            response: The downloaded page to extract entries from.

        Returns:
            A list of ``Entry`` items. ``entry['word']`` holds the text
            nodes of the first ``<a>`` inside the ``<dt>``, and
            ``entry['etym']`` holds the text nodes of the ``<dd>`` that
            directly follows that ``<dt>``; both are lists of strings.
        """
        sel = Selector(response)

        results = []
        for dt in sel.xpath('//dt'):
            entry = Entry()
            entry['word'] = dt.xpath('.//a[1]/text()').extract()
            # dd[1] limits the match to the nearest following <dd>. Without
            # the index the axis selects every later <dd> sibling, so each
            # entry would also contain the etymologies of all later words.
            entry['etym'] = dt.xpath('./following-sibling::dd[1]/text()').extract()
            results.append(entry)

        return results
Empty file.
11 changes: 11 additions & 0 deletions features/etymology/scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = etym_scraper.settings

[deploy]
#url = http://localhost:6800/
project = etym_scraper
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ PyYAML==3.11
nltk==2.0.4
numpy==1.8.1
wsgiref==0.1.2
python-Levenshtein
python-Levenshtein

0 comments on commit c6f2326

Please sign in to comment.