-
Notifications
You must be signed in to change notification settings - Fork 0
/
retriever.py
57 lines (43 loc) · 1.65 KB
/
retriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
import datetime
import mechanize
import codecs
import os
# Presumably the character encoding of the downloaded pages. It is not
# referenced by the code visible in this file -- TODO confirm where it is used.
SOURCE_FILE_ENCODING = 'iso-8859-1'
# URL template for one day's page. The single %s placeholder is filled with a
# date rendered as MM/DD/YYYY by Retriever.format_date_for_url.
URL_FORMAT_STR = "http://www.mediabase.com/whatsong/whatsong.asp?var_s=087066073071045070077&MONDTE=%s"
# Directory in which the per-day "<YYYYMMDD>.html" files are stored. It is a
# relative path, so it is resolved against the current working directory.
DATA_DIR = "data"
class Retriever():
    """Download and locate the per-day HTML pages for today and yesterday.

    Each page is stored as ``<DATA_DIR>/<YYYYMMDD>.html``. "Today" is fixed
    when the instance is created (naive local time from
    ``datetime.datetime.now()``), and "yesterday" is exactly one day earlier.
    """

    def __init__(self):
        """Capture the current time and the time exactly 24 hours before it."""
        self.today = datetime.datetime.now()
        one_day = datetime.timedelta(days=1)
        self.yesterday = self.today - one_day

    def format_date_for_url(self, dt):
        """Return *dt* as ``MM/DD/YYYY``, the form inserted into the URL."""
        return dt.strftime("%m/%d/%Y")

    def format_date_for_file(self, dt):
        """Return *dt* as ``YYYYMMDD``, the form used for file names."""
        return dt.strftime("%Y%m%d")

    def filename_for_date(self, dt):
        """Return the path of the HTML file for *dt* inside ``DATA_DIR``."""
        return os.path.join(DATA_DIR, "{0}.html".format(self.format_date_for_file(dt)))

    def open_file_for_date(self, dt, mode):
        """Open the HTML file for *dt* with the given *mode* and return it.

        The caller is responsible for closing the returned file object.
        """
        return open(self.filename_for_date(dt), mode)

    def latest_two_filenames(self):
        """Return the paths of today's and yesterday's files, in that order.

        Raises:
            IOError: if either file does not exist on disk yet.
        """
        filenames = []
        for date in [self.today, self.yesterday]:
            name = self.filename_for_date(date)
            if not os.path.isfile(name):
                raise IOError("You must run the retriever first, as we don't have current files.")
            filenames.append(name)
        return filenames

    def retrieve(self):
        """Download the pages for today and yesterday that are not on disk.

        Dates whose file already exists are skipped. Errors raised by the
        download or by the file system propagate to the caller.
        """
        for date in [self.today, self.yesterday]:
            target = self.filename_for_date(date)
            if os.path.isfile(target):
                continue
            url = URL_FORMAT_STR % self.format_date_for_url(date)
            # Download BEFORE creating the file: if the request fails, no
            # empty file is left behind that later runs would mistake for an
            # already-retrieved page and skip.
            data = mechanize.Browser().open(url).get_data()
            # Create the data directory on first use instead of failing.
            target_dir = os.path.dirname(target)
            if target_dir and not os.path.isdir(target_dir):
                os.makedirs(target_dir)
            # Binary mode stores the response body unchanged (a bytes body
            # cannot be written to a text-mode file on Python 3); the with
            # block closes the file even if the write raises.
            with self.open_file_for_date(date, "wb") as out:
                out.write(data)
if __name__ == "__main__":
    # Script entry point: fetch whichever of today's and yesterday's pages
    # are not yet stored on disk.
    Retriever().retrieve()