# -*- coding: utf-8 -*-
"""
Newspaper treats urls for news articles as critical components.
Hence, we have an entire module dedicated to them.
"""

__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

import logging
import re

from urllib.parse import parse_qs, urljoin, urlparse, urlsplit, urlunsplit

from tldextract import tldextract

log = logging.getLogger(__name__)

MAX_FILE_MEMO = 20000
_STRICT_DATE_REGEX_PREFIX = r'(?<=\W)'
DATE_REGEX = r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?'
STRICT_DATE_REGEX = _STRICT_DATE_REGEX_PREFIX + DATE_REGEX
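
# Illustrative note (hypothetical url, a sketch of the intended behavior):
# re.search(DATE_REGEX, 'example.com/2014/05/31/some-title') should match
# the '/2014/05/31/' span, which valid_url() below treats as a strong
# signal that a url points at a news article.
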
ALLOWED_TYPES = ['html', 'htm', 'md', 'rst', 'aspx', 'jsp', 'rhtml', 'cgi',
'xhtml', 'jhtml', 'asp', 'shtml']
GOOD_PATHS = ['story', 'article', 'feature', 'featured', 'slides',
'slideshow', 'gallery', 'news', 'video', 'media',
'v', 'radio', 'press']
BAD_CHUNKS = ['careers', 'contact', 'about', 'faq', 'terms', 'privacy',
'advert', 'preferences', 'feedback', 'info', 'browse', 'howto',
'account', 'subscribe', 'donate', 'shop', 'admin']
BAD_DOMAINS = ['amazon', 'doubleclick', 'twitter']


def remove_args(url, keep_params=(), frags=False):
    """
    Remove all query arguments from a url, keeping only those whose
    name starts with a prefix in `keep_params`; the fragment is
    dropped unless `frags` is True.
    """
parsed = urlsplit(url)
filtered_query = '&'.join(
qry_item for qry_item in parsed.query.split('&')
if qry_item.startswith(keep_params)
)
if frags:
frag = parsed[4:]
else:
frag = ('',)
return urlunsplit(parsed[:3] + (filtered_query,) + frag)
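
# Illustrative usage (hypothetical url):
#   remove_args('http://example.com/page?id=3&ref=foo', keep_params=('id',))
#   should return 'http://example.com/page?id=3'
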
def redirect_back(url, source_domain):
    """
    Some sites, like Pinterest, have APIs that route news urls
    through their own site, with the real news url passed as a
    GET param. This method catches that and returns the real url.
    """
parse_data = urlparse(url)
domain = parse_data.netloc
query = parse_data.query
# If our url is even from a remotely similar domain or
# sub domain, we don't need to redirect.
if source_domain in domain or domain in source_domain:
return url
query_item = parse_qs(query)
if query_item.get('url'):
# log.debug('caught redirect %s into %s' % (url, query_item['url'][0]))
return query_item['url'][0]
return url
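
# Illustrative usage (hypothetical urls):
#   redirect_back('http://pinterest.com/pin?url=http://cnn.com/story', 'cnn.com')
#   should return 'http://cnn.com/story', since the real article url rides
#   in the 'url' GET param on a foreign domain.
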
def prepare_url(url, source_url=None):
    """
    Operations that purify a url: resolve Pinterest-style redirects
    and merge relative urls with their absolute source urls.
    (Argument removal via `remove_args` is currently disabled.)
    """
try:
if source_url is not None:
source_domain = urlparse(source_url).netloc
proper_url = urljoin(source_url, url)
proper_url = redirect_back(proper_url, source_domain)
# proper_url = remove_args(proper_url)
else:
# proper_url = remove_args(url)
proper_url = url
except ValueError as e:
log.critical('url %s failed on err %s' % (url, str(e)))
proper_url = ''
return proper_url
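
# Illustrative usage (hypothetical urls):
#   prepare_url('/2014/05/31/some-story', source_url='http://cnn.com')
#   should return 'http://cnn.com/2014/05/31/some-story'
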
def valid_url(url, verbose=False, test=False):
    """
    Is this URL a valid news-article url?

    Perform a regex check on an absolute url.

    First, perform a few basic checks, like making sure the format of
    the url is right (scheme, domain, tld).

    Second, make sure that the url isn't some static resource by
    checking the file type.

    Then, search for a YYYY/MM/DD pattern in the url. News sites
    love to use this pattern, so it is a very safe bet. Separators
    can be '.', '-', '/' or '_'. Years can be 2 or 4 digits and must
    fall in 1900-2099. Months and days can be ambiguous 2 digit
    numbers, and one is even optional; some sites are liberal with
    their formatting. The regex also matches snippets of GET queries
    with keywords inside them, ex: asdf.php?topic_id=blahlbah.
    We permit alphanumerics, '_' and '-'.

    Our next check makes sure that a keyword is within one of the
    separators in the url (subdomain or early path separator);
    cnn.com/story/blah-blah-blah would pass due to 'story'.

    We also filter out urls at this stage by aggressively checking
    whether any resemblance of the source domain's name or tld is
    present within the url slug. If it is, that's bad; it is probably
    a company link, like 'cnn is hiring new interns'.

    Finally, we filter out urls with a subdomain or first-degree path
    on a registered bad keyword.
    """
# If we are testing this method in the testing suite, we actually
# need to preprocess the url like we do in the article's constructor!
if test:
url = prepare_url(url)
# 11 chars is shortest valid url length, eg: http://x.co
if url is None or len(url) < 11:
if verbose: print('\t%s rejected because len of url is less than 11' % url)
return False
r1 = ('mailto:' in url) # TODO not sure if these rules are redundant
r2 = ('http://' not in url) and ('https://' not in url)
if r1 or r2:
        if verbose: print('\t%s rejected because of its url structure' % url)
return False
path = urlparse(url).path
# input url is not in valid form (scheme, netloc, tld)
if not path.startswith('/'):
return False
# the '/' which may exist at the end of the url provides us no information
if path.endswith('/'):
path = path[:-1]
# '/story/cnn/blahblah/index.html' --> ['story', 'cnn', 'blahblah', 'index.html']
path_chunks = [x for x in path.split('/') if len(x) > 0]
# siphon out the file type. eg: .html, .htm, .md
if len(path_chunks) > 0:
file_type = url_to_filetype(url)
# if the file type is a media type, reject instantly
if file_type and file_type not in ALLOWED_TYPES:
if verbose: print('\t%s rejected due to bad filetype' % url)
return False
last_chunk = path_chunks[-1].split('.')
        # the file type is not of use to us anymore, remove it from the url
if len(last_chunk) > 1:
path_chunks[-1] = last_chunk[-2]
# Index gives us no information
if 'index' in path_chunks:
path_chunks.remove('index')
# extract the tld (top level domain)
tld_dat = tldextract.extract(url)
subd = tld_dat.subdomain
tld = tld_dat.domain.lower()
url_slug = path_chunks[-1] if path_chunks else ''
if tld in BAD_DOMAINS:
if verbose: print('%s caught for a bad tld' % url)
return False
if len(path_chunks) == 0:
dash_count, underscore_count = 0, 0
else:
dash_count = url_slug.count('-')
underscore_count = url_slug.count('_')
# If the url has a news slug title
if url_slug and (dash_count > 4 or underscore_count > 4):
if dash_count >= underscore_count:
if tld not in [x.lower() for x in url_slug.split('-')]:
if verbose: print('%s verified for being a slug' % url)
return True
if underscore_count > dash_count:
if tld not in [x.lower() for x in url_slug.split('_')]:
if verbose: print('%s verified for being a slug' % url)
return True
# There must be at least 2 subpaths
if len(path_chunks) <= 1:
if verbose: print('%s caught for path chunks too small' % url)
return False
# Check for subdomain & path red flags
# Eg: http://cnn.com/careers.html or careers.cnn.com --> BAD
for b in BAD_CHUNKS:
if b in path_chunks or b == subd:
if verbose: print('%s caught for bad chunks' % url)
return False
match_date = re.search(DATE_REGEX, url)
# if we caught the verified date above, it's an article
if match_date is not None:
if verbose: print('%s verified for date' % url)
return True
for GOOD in GOOD_PATHS:
if GOOD.lower() in [p.lower() for p in path_chunks]:
if verbose: print('%s verified for good path' % url)
return True
if verbose: print('%s caught for default false' % url)
return False
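
# Illustrative checks (hypothetical urls, traced against the rules above):
#   valid_url('http://cnn.com/story/some-slug')        # True, 'story' is in GOOD_PATHS
#   valid_url('http://cnn.com/2014/05/31/some-story')  # True, matches DATE_REGEX
#   valid_url('http://cnn.com/careers')                # False, too few path chunks
#   valid_url('http://cnn.com/about/careers/apply')    # False, contains BAD_CHUNKS entries
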
def url_to_filetype(abs_url):
"""
Input a URL and output the filetype of the file
specified by the url. Returns None for no filetype.
'http://blahblah/images/car.jpg' -> 'jpg'
'http://yahoo.com' -> None
"""
path = urlparse(abs_url).path
# Eliminate the trailing '/', we are extracting the file
if path.endswith('/'):
path = path[:-1]
    path_chunks = [x for x in path.split('/') if len(x) > 0]
    # no path chunks, eg 'http://yahoo.com' --> no file to speak of
    if len(path_chunks) == 0:
        return None
    last_chunk = path_chunks[-1].split('.')  # last chunk == file usually
    if len(last_chunk) < 2:
        return None
file_type = last_chunk[-1]
# Assume that file extension is maximum 5 characters long
if len(file_type) <= 5 or file_type.lower() in ALLOWED_TYPES:
return file_type.lower()
return None


def get_domain(abs_url, **kwargs):
    """
    Return a url's domain. This method exists to
    encapsulate all url code in this file.
    """
if abs_url is None:
return None
return urlparse(abs_url, **kwargs).netloc


def get_scheme(abs_url, **kwargs):
    """
    Return a url's scheme; None for a None url.
    """
if abs_url is None:
return None
return urlparse(abs_url, **kwargs).scheme


def get_path(abs_url, **kwargs):
    """
    Return a url's path; None for a None url.
    """
if abs_url is None:
return None
return urlparse(abs_url, **kwargs).path
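
# Illustrative accessors (hypothetical url):
#   get_domain('http://blog.example.com/path')  -> 'blog.example.com'
#   get_scheme('http://blog.example.com/path')  -> 'http'
#   get_path('http://blog.example.com/path')    -> '/path'
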
def is_abs_url(url):
    """
    This regex was brought to you by django!
    """
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return regex.search(url) is not None
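

if __name__ == '__main__':
    # A minimal smoke-test sketch, not part of the original module; the
    # urls below are hypothetical, and valid_url() needs tldextract installed.
    article = 'http://cnn.com/2014/05/31/some-news-story.html'
    print(prepare_url('/2014/05/31/some-news-story.html',
                      source_url='http://cnn.com'))  # absolute form of `article`
    print(valid_url(article, verbose=True))  # True, via the date pattern
    print(url_to_filetype(article))          # 'html'
    print(get_domain(article))               # 'cnn.com'
    print(is_abs_url(article))               # True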