rough etymology scraper

dmrd · May 6, 2014 · c6f2326 · c6f2326
1 parent eaa54f2
commit c6f2326
Show file tree

Hide file tree

Showing 9 changed files with 80 additions and 1 deletion.
diff --git a/features/etymology/etym_scraper/__init__.py b/features/etymology/etym_scraper/__init__.py
diff --git a/features/etymology/etym_scraper/items.py b/features/etymology/etym_scraper/items.py
@@ -0,0 +1,10 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.item import Item, Field
+
+class Entry(Item):  # one scraped dictionary entry
+    word = Field()  # filled by the spider from the first <a> text inside a <dt>
+    etym = Field()  # filled by the spider from <dd> text following that <dt>
diff --git a/features/etymology/etym_scraper/pipelines.py b/features/etymology/etym_scraper/pipelines.py
@@ -0,0 +1,8 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+class EtymScraperPipeline(object):  # pass-through pipeline: performs no processing
+    def process_item(self, item, spider):  # `spider` is not used here
+        return item  # hand the item back unchanged
diff --git a/features/etymology/etym_scraper/settings.py b/features/etymology/etym_scraper/settings.py
@@ -0,0 +1,15 @@
+# Scrapy settings for etym_scraper project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#
+
+BOT_NAME = 'etym_scraper'  # matches the project name used in scrapy.cfg
+
+SPIDER_MODULES = ['etym_scraper.spiders']  # package that contains etym_spider.py
+NEWSPIDER_MODULE = 'etym_scraper.spiders'  # same package as SPIDER_MODULES
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'etym_scraper (+http://www.yourdomain.com)'
diff --git a/features/etymology/etym_scraper/spiders/__init__.py b/features/etymology/etym_scraper/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/features/etymology/etym_scraper/spiders/etym_spider.py b/features/etymology/etym_scraper/spiders/etym_spider.py
@@ -0,0 +1,31 @@
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.selector import Selector
+from etym_scraper.items import Entry
+
+class EtymSpider(CrawlSpider):
+    """Crawl etymonline.com from its index page and emit one Entry per <dt>."""
+    name = "etym_scraper"
+    allowed_domains = ["etymonline.com"]
+    start_urls = ["http://www.etymonline.com/index.php"]
+
+    # Follow links whose URL matches index.php; the dot is escaped so it is
+    # matched literally, and the trailing comma makes `allow` a real tuple.
+    rules = (
+        Rule(SgmlLinkExtractor(allow=(r'index\.php',)), callback='parse_details'),
+    )
+
+    def parse_details(self, response):
+        """Return a list of Entry items, one for each <dt> in the response.
+
+        Each field holds the list of text nodes produced by ``extract()``.
+        """
+        results = []
+        for dt in Selector(response).xpath('//dt'):
+            entry = Entry()
+            entry['word'] = dt.xpath('.//a[1]/text()').extract()
+            # dd[1] pairs the term with its own definition only; a bare
+            # following-sibling::dd would also match every later <dd>.
+            entry['etym'] = dt.xpath('./following-sibling::dd[1]/text()').extract()
+            results.append(entry)
+        return results
diff --git a/features/etymology/requirements.txt b/features/etymology/requirements.txt
diff --git a/features/etymology/scrapy.cfg b/features/etymology/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/en/latest/topics/scrapyd.html
+
+[settings]
+default = etym_scraper.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = etym_scraper
diff --git a/requirements.txt b/requirements.txt
@@ -2,4 +2,4 @@ PyYAML==3.11
 nltk==2.0.4
 numpy==1.8.1
 wsgiref==0.1.2
-python-Levenshtein
+python-Levenshtein