forked from mattbierner/DreamScrape
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
76 lines (58 loc) · 4.77 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re
import requests
import json
from bs4 import BeautifulSoup
from collections import OrderedDict
DREAMBANK = "http://dreambank.net/random_sample.cgi?series={0}&min=1&max=9999&n=10000"
# Dream collections to download
DREAMERS = {
"alta": "Alta: a detailed dreamer", "angie": "Angie: age 18 & 20", "arlie": "Arlie: a middle-aged woman", "b": "Barb Sanders", "b2": "Barb Sanders #2", "b-baseline": "Barb Sanders: baseline", "bay_area_girls_456": "Bay Area girls: Grades 4-6", "bay_area_girls_789": "Bay Area girls: Grades 7-9", "bea1": "Bea 1: a high school student", "bea2": "Bea 2: a college student", "blind-f": "Blind dreamers (F)", "blind-m": "Blind dreamers (M)", "chris": "Chris: a transvestite", "chuck": "Chuck: a physical scientist", "hall_female": "College women, late 1940s", "dahlia": "Dahlia: concerns with appearance", "david": "David: teenage dreams", "vonuslar.de": "Detlev von Uslar, auf Deutsch", "dorothea": "Dorothea: 53 years of dreams", "ed": "Ed: dreams of his late wife", "edna": "Edna: a blind woman", "elizabeth": "Elizabeth: a woman in her 40s", "emmas_husband": "Emma's Husband", "emma": "Emma: 48 years of dreams", "esther": "Esther: an adolescent girl", "german-f.de": "German dreams (F)", "german-m.de": "German dreams (M)", "norms-f": "Hall/VdC Norms: Female", "norms-m": "Hall/VdC Norms: Male", "jasmine1": "Jasmine 1: middle school", "jasmine2": "Jasmine 2: high school", "jasmine3": "Jasmine 3: college 1", "jasmine4": "Jasmine 4: college 2", "jeff": "Jeff: a lucid dreamer", "joan": "Joan: a lesbian", "kenneth": "Kenneth", "lawrence": "Lawrence, a young man", "mack": "Mack: A poor recaller", "madeline1-hs": "Madeline 1: High School", "madeline2-dorms": "Madeline 2: College Dorms", "madeline3-offcampus": "Madeline 3: Off-Campus", "madeline4-postgrad": "Madeline 4: After College", "mark": "Mark: a young boy", "melissa": "Melissa: a young girl", "melora": "Melora (Melvin's wife)", "melvin": "Melvin (Melora's husband)", "merri": "Merri: an artist", "miami-home": "Miami Home-Lab: Home", "miami-lab": "Miami Home-Lab: Lab", "midwest_teens-f": "Midwest teenagers (F)", "midwest_teens-m": "Midwest teenagers (M)", "nancy": "Nancy: Caring & headstrong", "natural_scientist": "The Natural Scientist", "norman": "Norman: a child molester", "pegasus": "Pegasus: a factory worker", "peru-m": "Peruvian men", "peru-f": "Peruvian women", "phil1": "Phil 1: teens", "phil2": "Phil 2: late 20s", "phil3": "Phil 3: retirement", "physiologist": "The Physiologist", "ringo": "Ringo: from the 1960s", "bosnak": "Robert Bosnak: A dream analyst", "samantha": "Samantha: in her 20s", "seventh_graders": "Seventh grade girls", "zurich-f.de": "Swiss children, auf Deutsch (F)", "zurich-m.de": "Swiss children, auf Deutsch (M)", "toby": "Toby: A friendly party animal", "tom": "Tom: An outgoing man", "ucsc_women": "UCSC women, 1996", "vickie": "Vickie: a 10-year-old girl", "vietnam_vet": "Vietnam Vet: 1970-2008 war dreams", "vietnam_vet2": "Vietnam Vet: 2015 dreams", "wedding": "Wedding dreams", "west_coast_teens": "West Coast teenage girls" }
NUMBER_RE = r'^\#(\S+)'
HEAD_RE = r'^\((.+?)\)(\w)'
def process_dream_span(span):
text = span.text.encode('utf-8').strip()
# remove number
number_groups = re.match(NUMBER_RE, text)
number = number_groups.group(1).strip()
text = re.sub(NUMBER_RE, '', text).strip()
# remove word count
text = re.sub(r'\s*\(\d+\s+words\)\s*$', '', text, re.I)
# Split nb-space as paragraphs
text = re.sub('\\s*?(\xc2\xa0)+\\s*', '\n', text).strip()
# sep desc
head_match = re.match(HEAD_RE, text)
if head_match is None:
return OrderedDict([
('number', number),
('content', text)])
head = head_match.group(1).strip()
text = re.sub(HEAD_RE, lambda x: x.group(2), text).strip()
return OrderedDict([
('number', number),
('head', head),
('content', text)])
def process_dream_page(text):
soup = BeautifulSoup(text, 'html.parser')
dreams = soup.find_all('span')
for dream in soup.find_all('span'):
data = process_dream_span(dream)
if data is not None:
yield data
def download_dreams(dreamer):
url = DREAMBANK.format(dreamer)
r = requests.get(url)
if r.status_code != 200:
print("error getting dreams")
return None
return r.text
def collect_dreams(dreamer, desc):
text = download_dreams(dreamer)
return OrderedDict([
('dreamer', dreamer),
('description', desc),
('dreams', list(process_dream_page(text)))])
for dreamer, desc in DREAMERS.iteritems():
dreams = collect_dreams(dreamer, desc)
if dreams is not None:
with open('dreams/' + dreamer + '.json', 'w') as out_file:
json.dump(dreams, out_file, sort_keys=False, indent=4, ensure_ascii=False)