newsextractor.py
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup   # the get_text() calls below need BeautifulSoup 4 (bs4), not the old BeautifulSoup 3 module
from datetime import datetime
# news (and blog) extractor for Fang Ning's sources
# notes added by Anqi
class NewsBlogExtractor:
    # [ matching patterns ]
    # moved inside the class by Anqi
    # TODO: needs to be moved to a separate config file
__flags__ = { \
'cnn': {'content':['p', 'class' , 'cnn_storypgraphtxt'], 'title':['h1'], 'date':['div','class','cnn_strytmstmp'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'economist': {'content':['id', 'id' , 'ec-article-body'], 'title':['h1'], 'date':['p','class','ec-article-info'], 'date_extract':['(\w+) ([0-9]{1,2})\w* ([0-9]{4})']}, \
'foxnews': {'content':['div', 'class' , 'article-text'], 'title':['h1'], 'date':['p','class','published updated dtstamp'], 'date_extract':['Published (\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'ibtimes': {'content':['id', 'id' , 'content'], 'title':['h1'], 'date':['p','class','story_on'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4}) ([0-9]{1,2}:[0-9]{1,2}) (\w+) (\w+)'], 'page':['http://www.ibtimes.com']}, \
'newyorktimes': {'content':['p', 'itemprop' ,'articleBody'], 'title':['h1'], 'date':['h6','class','dateline'], 'date_extract':['Published: (\w+) ([0-9]{1,2}), ([0-9]{4})'], 'page':['http://www.nytimes.com'], 'image':['http://yoursdp.org']}, \
'rfi': {'content':['div', 'class' , 'article-main-text'], 'title':['h1'], 'date':['div','class','article-header-date'], 'date_extract':['([0-9]{1,2}) (\w+) ([0-9]{4})$']}, \
'smh': {'content':['div', 'class' , 'articleBody'], 'title':['h1'], 'date':['tag','tag','cite'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'straitstimes': {'content':['div', 'class' , 'storyRight'], 'title':['h1'], 'date':['div','class','published'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})$']}, \
'swissinfo': {'content':['div', 'class' , 'fl-l'], 'title':['h1'], 'date':['div','class','date grey-dark italic'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4}) - (\d+:\d+)']}, \
'thesun': {'content':['id', 'id' , 'bodyText'], 'title':['h1'], 'date':['div','class','published-date-text'], 'date_extract':['\w+: ([0-9]{1,2})\w* (\w+) ([0-9]{4})']}, \
'irishsun': {'content':['tag', 'tag' , 'article'], 'title':['h2'], 'date':['irishsun', '',''], 'date_extract':['([0-9]{1,2})\w* (\w+), ([0-9]{4})$']}, \
'dailymail': {'content':['id' , 'id' , 'js-article-text'], 'title':['h1'], 'date':['span','class','article-timestamp'], 'date_extract':['([0-9]{1,2}:[0-9]{1,2}) (\w+), ([0-9]{1,2})\w* (\w+) ([0-9]{4})$']}, \
'guardian': {'content':['id' , 'id' ,'content'], 'title':['h1'], 'date':['tag','tag','time'], 'date_extract':['\w+ ([0-9]{1,2}) (\w+) ([0-9]{4}) ([0-9]{1,2}.[0-9]{1,2}) (\w+)']}, \
'ananova': {'content':['div', 'class' , 'articleinner' ], 'title':['h1'], 'date':['div','class','meta'], 'date_extract':['([0-9]{1,2}) (\w+) ([0-9]{4}), ([0-9]{1,2}:[0-9]{1,2})']}, \
'corkman': {'content':['div', 'class' , 'body' ], 'title':['head'], 'date':['p','class','published'], 'date_extract':['\w+ (\w+) ([0-9]{1,2}) ([0-9]{4})']}, \
'ladyironchef':{'content':['id' , 'id' ,'contentleft'], 'title':['h1'], 'date':['p','class','date'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'noobcook': {'content':['div', 'class' , 'entry-content' ], 'title':['h1'], 'date':['p','class','headline_meta'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'vivawoman': {'content':['id', 'id' , 'content'], 'title':['h1'], 'date':['span','class','date published time'], 'date_extract':['([0-9]{1,2}) (\w+) ([0-9]{4})$']}, \
'xuxiaoming': {'removehead':True, 'content':['div', 'id' , 'sina_keyword_ad_area2'], 'title':['h2'], 'date':['span','class','time SG_txtc'], 'date_extract':['\(([0-9]{4})-([0-9]{1,2})-([0-9]{1,2}) ([0-9]{1,2}:[0-9]{1,2}):[0-9]{1,2}\)']}, \
'twocold': {'removehead':True, 'content':['div', 'id' , 'sina_keyword_ad_area2'], 'title':['h2'], 'date':['span','class','time SG_txtc'], 'date_extract':['\(([0-9]{4})-([0-9]{1,2})-([0-9]{1,2}) ([0-9]{1,2}:[0-9]{1,2}):[0-9]{1,2}\)']},\
'jpost':{'content':['div', 'class' , 'body'], 'title':['h1'], 'date':['div','class','date'], 'date_extract':['([0-9]{1,2})/([0-9]{1,2})/([0-9]{4}) ([0-9]{1,2}:[0-9]{1,2})']}, \
'channelnewsasia':{'content':['id', 'id' , 'articlecontent'], 'title':['head'], 'date':['p','class','header'], 'date_extract':['\w+: ([0-9]{1,2})\w* (\w+) ([0-9]{4})']}, \
'bbc':{'content':['tag', 'tag' , 'p'], 'title':['h1'], 'date':['span','class','story-date'], 'date_extract':['([0-9]{1,2}) (\w+) ([0-9]{4}) \w+ \w+ \w+ ([0-9]{1,2}:[0-9]{1,2}) (\w+)']}, \
'ynet':{'content':['id', 'id' , 'article_content'], 'title':['h1'], 'date':['p','style','margin-top:8'], 'date_extract':['([0-9]{1,2}).([0-9]{1,2}).([0-9]{1,2}), ([0-9]{1,2}:[0-9]{1,2})']}, \
'newscomau':{'content':['div', 'class' , 'story-body lead-media-none'], 'title':['h1'], 'date':['li','class','date-and-time last'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4}) ([0-9]{1,2}:[0-9]{1,2})(\w+)']}, \
'MissTamChiakSingaporeFoodBlogchinese' : {'content':['div', 'class' , 'post-body entry-content'], 'title':['h3'], 'date':['h2','class','date-header'], 'date_extract':['([0-9]{1,2}) (\w+), ([0-9]{4})']}, \
'mrbrown' : {'content':['div', 'class' , 'entry-body'], 'title':['h3'], 'date':['h2','class','date-header'], 'date_extract':['\w+, (\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'ieatishootipost':{'content':['div', 'class' , 'post-body entry-content'], 'title':['h3'], 'date':['a','class','timestamp-link'], 'date_extract':['\w+, (\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'yawningbread':{'content':['div', 'class' , 'entry-content'], 'title':['h3'], 'date':['abbr','class','published'], 'date_extract':['([0-9]{1,2}) (\w+) ([0-9]{4})']}, \
'yoursdp':{'content':['tag', 'tag' , 'p'], 'title':['head'], 'date':['td','class','createdate'], 'date_extract':['([0-9]{1,2}) (\w+) ([0-9]{4})']}, \
'habitatnews':{'content':['p', 'class' , 'blog'], 'title':['h3'], 'date':['p','class','blogdate'], 'date_extract':['\w+ ([0-9]{1,2}) (\w+) ([0-9]{4})']}, \
'singaporedaily':{'content':['p', 'class' , 'blog'], 'title':['h2'], 'date':['h2','class','post_name'], 'date_extract':['\w+ \w+: ([0-9]{1,2}) (\w+) ([0-9]{4})']}, \
'karencheng':{'content':['tag', 'tag' , 'p'], 'title':['h2'], 'date':['karencheng', '',''], 'date_extract':['([0-9]{1,2}) (\w+) ([0-9]{4})']}, \
'sabrinasg':{'content':['div', 'class' , 'p_contentbody'], 'title':['h1'], 'date':['div', 'class','p_submitted'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'soapz':{'content':['div', 'class' , 'date-posts'], 'title':['h3'], 'date':['h2', 'class','date-header'], 'date_extract':['\w+, (\w+) ([0-9]{1,2}), ([0-9]{4})']}, \
'yahoo':{'content':['div', 'class' , 'entry-content'], 'title':['h1'], 'date':['cite', 'class','byline vcard'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})$']}, \
'wsj':{'content':['div', 'id' , 'recipeACShopAndBuyText'], 'title':['h1'], 'date':['li', 'class','dateStamp'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4}),']}, \
'msn':{'content':['tag', 'tag' , 'p'], 'title':['h1'], 'date':['li', 'class','dateStamp'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4}),']}, \
'time':{'content':['span', 'class' , 'lingo_region'], 'title':['h1'], 'date':['span', 'class','date'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})$']}, \
'scmp':{'content':['span', 'class' , 'article_body'], 'title':['h1'], 'date':['span', 'class','article_byline'], 'date_extract':['(\w+) ([0-9]{1,2}), ([0-9]{4})$']}, \
}
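    # Schema of an __flags__ entry (a rough sketch, inferred from how extract() and
    # findNextPage() read these fields; not documented in the original file):
    #   'content'      : [selector kind, attribute name, attribute value] locating the body text
    #   'title'        : [tag name] ('h1'/'h2'/'h3', or 'head' to use the <title> element)
    #   'date'         : [tag or special kind, attribute name, attribute value] locating the timestamp element
    #   'date_extract' : [regex] applied to the timestamp text
    #   'page'         : [base URL] (optional) prepended to relative "next page" links
    #   'removehead'   : True (optional) strips the <head> section before parsing (see the sina note in extract())
    #   'image'        : (optional) appears in one entry but is not read by this module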
    # convert an English month name (or its three-letter abbreviation) to a month number string
    def convertMonth(self, string):
        table = {'jan':'1', 'january':'1',
                 'feb':'2', 'february':'2',
                 'mar':'3', 'march':'3',
                 'apr':'4', 'april':'4',
                 'may':'5',
                 'jun':'6', 'june':'6',
                 'jul':'7', 'july':'7',
                 'aug':'8', 'august':'8',
                 'sep':'9', 'september':'9',
                 'oct':'10', 'october':'10',
                 'nov':'11', 'november':'11',
                 'dec':'12', 'december':'12'}
        if string.lower() in table:
            return table[string.lower()]
        else:
            return '0'
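    # Usage sketch (illustrative, not from the original file):
    #   convertMonth('June')  -> '6'
    #   convertMonth('SEP')   -> '9'
    #   convertMonth('05')    -> '0'   (numeric month strings are not recognised)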
    # main extractor function
    # returns None if the source name is not configured
    # otherwise, returns a dict to the indexing program
    # stream is a text string containing the html content, not a link
    # sourcename is the configured resource name (a key of __flags__)
    def extract(self, stream, sourcename = None):
        # check whether the sourcename is configured
        if sourcename not in self.__flags__:
            return None
        res = self.__flags__[sourcename]
        # special case for sina blogs:
        # the <head> part of sina's blogs contains something that stops
        # BeautifulSoup from parsing the body, leaving a tree that holds
        # only the head, so we drop the head section for now
        if 'removehead' in res and res['removehead'] == True:
            soup = BeautifulSoup(stream[:stream.find('<head>')] + stream[stream.find('<body>'):])
        else:
            soup = BeautifulSoup(stream)
        # remove scripts
        for script in soup("script"):
            script.extract()
        # extract the title
        # res['title'] is a list (hence the [0]) rather than a plain string,
        # to leave room for future extension
        try:
            if res['title'][0] in set(['h1', 'h2', 'h3']):
                title = soup.find(res['title'][0]).get_text()
            elif res['title'][0] == 'head':
                # special processing for channelnewsasia, corkman, yoursdp, e.g.
                # <title>Suu Kyi makes Myanmar parliament debut - Channel NewsAsia</title>
                title = soup.html.head.title.string.split('-')[0]
            title = re.sub(u'[\u0000-\u0020]+', ' ', title).strip()
        except:
            title = ''
        # extract the date
        try:
            # "data" below is the raw text of the element that carries the timestamp
            if res['date'][0] == 'tag':
                # find by tag name, e.g. <time>...</time>
                data = soup.find(res['date'][2]).get_text()
            elif res['date'][0] == 'id':
                # find by element id, e.g. <div id='***'>...</div>
                data = soup.find(id=res['date'][2]).get_text()
            elif res['date'][0] == 'irishsun':
                # special for irishsun: find by relative position among tags
                data = soup.h2.findNextSibling('p').get_text()
            elif res['date'][0] == 'karencheng':
                # special for karencheng: find by relative position among tags
                data = soup.h2.findNextSibling('small').get_text()
            else:
                # find by tag type plus attribute, e.g. <div class='date'>...</div>
                data = soup.find(res['date'][0], {res['date'][1]: res['date'][2]}).get_text()
            data = re.sub(u'[\u0000-\u0020]+', ' ', data).strip()
            # extracted date format: (month, day, year, [hour:minute], [am|pm], [time zone]),
            # e.g. ('June', '01', '2012', '10:10', 'AM', 'GMT')
            # TODO: unify the string formats, e.g. Jun = June, AM = am
            date = re.search(res['date_extract'][0], data)
            if sourcename in set(['singaporedaily', 'habitatnews', 'yoursdp', 'yawningbread', 'MissTamChiakSingaporeFoodBlogchinese', 'rfi', 'thesun', 'irishsun', 'vivawoman', 'channelnewsasia']):
                # e.g. 01 June 2012
                new_date = datetime(int(date.group(3)), int(self.convertMonth(date.group(2))), int(date.group(1))).isoformat(' ')
            elif sourcename == 'dailymail':  # 14:21 GMT, 8 June 2012
                new_date = datetime(int(date.group(5)), int(self.convertMonth(date.group(4))), int(date.group(3))).isoformat(' ')
            elif sourcename == 'guardian':  # 8 June 2012 19.47 BST
                new_date = datetime(int(date.group(3)), int(self.convertMonth(date.group(2))), int(date.group(1))).isoformat(' ')
            elif sourcename == 'ananova':  # 10 June 2012, 4:58
                new_date = datetime(int(date.group(3)), int(self.convertMonth(date.group(2))), int(date.group(1))).isoformat(' ')
            elif sourcename == 'xuxiaoming' or sourcename == 'twocold':  # 2012-05-10 03:21:27
                # this pattern captures (year, month, day) in that order, and the month is already numeric
                new_date = datetime(int(date.group(1)), int(date.group(2)), int(date.group(3))).isoformat(' ')
            elif sourcename == 'bbc':  # 22 June 2012 Last updated at 17:43 GMT
                new_date = datetime(int(date.group(3)), int(self.convertMonth(date.group(2))), int(date.group(1))).isoformat(' ')
            elif sourcename == 'ynet':  # 06.22.12, 09:33
                new_date = datetime(int('20' + date.group(3)), int(date.group(1)), int(date.group(2))).isoformat(' ')
            else:  # June 01 2012 10:20 AM GMT
                new_date = datetime(int(date.group(3)), int(self.convertMonth(date.group(1))), int(date.group(2))).isoformat(' ')
        except:
            print "newsextractor: failed to extract the date"
            new_date = ''
        # extract the content
        try:
            if res['content'][0] == 'tag':
                contents = soup.findAll(res['content'][2])
            elif res['content'][0] == 'id':
                contents = soup.findAll(id=res['content'][2])
            else:
                # find by tag type plus attribute (the div, p and span entries in __flags__)
                contents = soup.findAll(res['content'][0], {res['content'][1]: res['content'][2]})
            text = ''
            for line in contents:
                text += line.get_text()
        except:
            text = ''
        # Create the dict to return
        # Note: "maincontent" must always be present
        # other fields are optional and carry a type postfix: _s for a string, _s_notindex for a string that is not indexed
        extracted = {'maincontent': text, 'title_s': title}
        if len(new_date) > 0:
            extracted['date_s_notindex'] = new_date
        return extracted
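    # Shape of the returned dict (an illustrative sketch, not real output):
    #   {'maincontent': u'Full article text ...',
    #    'title_s': u'Suu Kyi makes Myanmar parliament debut',
    #    'date_s_notindex': '2012-06-01 00:00:00'}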
    # merge multiple pages of one article into a single result
    # pages is a list of strings, each holding the html of one page
    # the output has the same shape as extract(), with the body texts concatenated
    def extractMultiple(self, pages, sourcename = None):
        text = ''
        new_date = ''
        title = ''
        for stream in pages:
            d = self.extract(stream, sourcename)
            if d is None:
                continue
            text += d['maincontent']
            title = d['title_s']
            if 'date_s_notindex' in d and len(d['date_s_notindex']) > 0:
                new_date = d['date_s_notindex']
        # Create the dict to return
        extracted = {'maincontent': text, 'title_s': title}
        if len(new_date) > 0:
            extracted['date_s_notindex'] = new_date
        return extracted
    # find the link to the next page of a multi-page article
    # the base URL used for relative links is the 'page' entry in __flags__
    def findNextPage(self, page, sourcename = None):
        if sourcename not in self.__flags__:
            return None
        res = self.__flags__[sourcename]
        if 'page' not in res:
            return None
        soup = BeautifulSoup(page)
        link = soup.find('a', {"class": "next"})
        if link is not None and link.get('href', '').startswith('/'):
            return res['page'][0] + link['href']
        else:
            return None
    # findAllImages: collect all <img> sources on the page,
    # resolving relative links against the page's <base href> or the given netloc
    def findAllImages(self, page, netloc, sourcename = None):
        # if sourcename not in self.__flags__:
        #     return None
        link_head = netloc
        soup = BeautifulSoup(page)
        try:
            if soup.html.head.base.has_attr('href'):
                link_head = soup.html.head.base['href'].strip(' ')
        except:
            pass
        images = {}
        for x in soup.findAll('img'):
            try:
                link = x['src'].strip(' ')
                if link[:7] != 'http://' and link[:8] != 'https://':
                    # relative link: prepend the base
                    images[link] = link_head + link
                else:
                    images[link] = link
            except:
                pass
        return images
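# A minimal usage sketch (not part of the original module). The input file name
# 'cnn_article.html' is hypothetical; any configured key of __flags__ can be
# passed as the sourcename.
if __name__ == '__main__':
    extractor = NewsBlogExtractor()
    with open('cnn_article.html') as f:   # hypothetical saved html page
        html = f.read()
    doc = extractor.extract(html, 'cnn')
    if doc is not None:
        print doc['title_s'].encode('utf-8')
        print doc.get('date_s_notindex', '')
        print doc['maincontent'][:200].encode('utf-8')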