12
12
import warnings
13
13
14
14
SKIP_ARGS = ['ref_src' , 'utm' ]
15
- SKIP_PREFIX = ['//www.' , '//mobile.' , '//m.' , 'www.' , 'mobile.' , 'm.' ]
15
+ SKIP_PREFIX = ['//www.' , '//mobile.' , '//m.' ]
16
16
GOOG_STATIC = 'www.gstatic.com'
17
17
G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif'
18
18
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
@@ -152,11 +152,12 @@ def get_first_link(soup: BeautifulSoup) -> str:
152
152
return ''
153
153
154
154
155
- def get_site_alt (link : str ) -> str :
155
+ def get_site_alt (link : str , site_alts : dict = SITE_ALTS ) -> str :
156
156
"""Returns an alternative to a particular site, if one is configured
157
157
158
158
Args:
159
- link: A string result URL to check against the SITE_ALTS map
159
+ link: A string result URL to check against the site_alts map
160
+ site_alts: A map of site alternatives to replace with. Defaults to SITE_ALTS
160
161
161
162
Returns:
162
163
str: An updated (or ignored) result link
@@ -178,9 +179,9 @@ def get_site_alt(link: str) -> str:
178
179
# "https://medium.com/..." should match, but "philomedium.com" should not)
179
180
hostcomp = f'{ parsed_link .scheme } ://{ hostname } '
180
181
181
- for site_key in SITE_ALTS .keys ():
182
+ for site_key in site_alts .keys ():
182
183
site_alt = f'{ parsed_link .scheme } ://{ site_key } '
183
- if not hostname or site_alt not in hostcomp or not SITE_ALTS [site_key ]:
184
+ if not hostname or site_alt not in hostcomp or not site_alts [site_key ]:
184
185
continue
185
186
186
187
# Wikipedia -> Wikiless replacements require the subdomain (if it's
@@ -193,9 +194,8 @@ def get_site_alt(link: str) -> str:
193
194
elif 'medium' in hostname and len (subdomain ) > 0 :
194
195
hostname = f'{ subdomain } .{ hostname } '
195
196
196
- parsed_alt = urlparse .urlparse (SITE_ALTS [site_key ])
197
- link = link .replace (hostname , SITE_ALTS [site_key ]) + params
198
-
197
+ parsed_alt = urlparse .urlparse (site_alts [site_key ])
198
+ link = link .replace (hostname , site_alts [site_key ]) + params
199
199
# If a scheme is specified in the alternative, this results in a
200
200
# replaced link that looks like "https://http://altservice.tld".
201
201
# In this case, we can remove the original scheme from the result
@@ -205,9 +205,12 @@ def get_site_alt(link: str) -> str:
205
205
206
206
for prefix in SKIP_PREFIX :
207
207
if parsed_alt .scheme :
208
- link = link .replace (prefix , '' )
208
+ # If a scheme is specified, remove everything before the
209
# first occurrence of it
210
+ link = f'{ parsed_alt .scheme } { link .split (parsed_alt .scheme , 1 )[- 1 ]} '
209
211
else :
210
- link = link .replace (prefix , '//' )
212
+ # Otherwise, replace the first occurrence of the prefix
213
+ link = link .replace (prefix , '//' , 1 )
211
214
break
212
215
213
216
return link
0 commit comments