Skip to content

Commit

Permalink
link functions
Browse files Browse the repository at this point in the history
  • Loading branch information
mabrownnyu committed Aug 10, 2018
1 parent d301604 commit 42d86cd
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 4 deletions.
7 changes: 5 additions & 2 deletions urlexpander/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,15 @@ def _expand(link, timeout=2, **kwargs):
except requests.exceptions.RequestException as e:
domain, url_long = _parse_error(str(e))

if domain in constants.url_appenders:
# replace list with constants.url_appenders
if domain in ['ln.is', 'linkis.com']:
url_long = link.replace(domain, '')
domain = get_domain(url_long)

elif domain in constants.short_domain_ad_redirects or domain == -1:
url_long = unshortenit.UnshortenIt().unshorten(link,
timeout=timeout)
domain = get_domain(url_long)


return dict(original_url=link,
resolved_domain=domain,
Expand Down Expand Up @@ -205,6 +206,7 @@ def expand(links_to_unshorten, chunksize=1280, n_workers=1,

# chunk the list of arguments
if verbose:
print("There are {} links to unshorten".format(len(links_to_unshorten)))
chunk_iter = tqdm(_chunks(links_to_unshorten, chunksize=chunksize))
else:
chunk_iter = _chunks(links_to_unshorten, chunksize=chunksize)
Expand All @@ -223,6 +225,7 @@ def expand(links_to_unshorten, chunksize=1280, n_workers=1,
if verbose:
print("{} failed to resolve due to error: {}".format(chunk[i],
str(type(exc))))

finally:
if isinstance(data, dict):
unshortened_urls.append(data)
Expand Down
3 changes: 2 additions & 1 deletion urlexpander/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@
'on.wsj.com',
]

# these are link shorteners that have the actual link appended to the end
url_appenders = ['ln.is', 'linkis.com']

all_short_domains = short_domain_ad_redirects + short_domain + ['ln.is']
all_short_domains = short_domain_ad_redirects + short_domain + url_appenders

congress_dataset_url = ('https://raw.githubusercontent.com/SMAPPNYU/'
'urlExpander/master/datasets/'
Expand Down
46 changes: 45 additions & 1 deletion urlexpander/core/tweet_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from urlexpander.core.api import get_domain

__all__ = ['get_link', 'count_matrix']
__all__ = ['get_link', 'count_matrix', 'strip_tweet_link']
__author__= 'Leon Yin'

def _get_full_text(tweet):
Expand Down Expand Up @@ -117,3 +117,47 @@ def count_matrix(df, user_col='user_id', domain_col='link_domain',

return matrix

def _strip_tweet_link(link):
'''
Best attempt at stripping twitter links for screen names and tweet ids.
:input link: a link with the domain 'twitter.com'
:returns: a list of dictionaries with the original link, the screen name, and the tweet_id of the link
'''
dict_ = {}

dict_['resolved_url'] = link.lower()
if 'status' in link.split('/'):
list_ = link.lower().split('/')
try:
if 'i/web/status' in link: dict_['linked_screen_name'] == None
else: dict_['linked_screen_name'] = list_[list_.index('status') - 1]
dict_['linked_tweet_id'] = list_[list_.index('status') + 1]
except:
dict_['linked_screen_name'] = None
dict_['linked_tweet_id'] = None

else:
dict_['linked_screen_name'] = link
dict_['linked_tweet_id'] = link

return dict_

def strip_tweet_link(link):
    '''
    Parses a link (or collection of links) to Twitter for the screen name
    and tweet id.

    :input link: a link, or an iterable of links, with the domain 'twitter.com'
    :returns: a single dictionary for a string input; otherwise a list of
        dictionaries (one per unique link) with the original link, the
        screen name, and the tweet id
    '''
    if isinstance(link, str):
        return _strip_tweet_link(link)

    # any non-string input is treated as an iterable of links;
    # deduplicate before parsing
    unique_links = set(link)
    return [_strip_tweet_link(one_link) for one_link in unique_links]
56 changes: 56 additions & 0 deletions urlexpander/core/youtube_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
__all__ = ['strip_yt_link']

def _strip_yt_link(link):
'''
Parses a link to youtube to get the screen name and tweet id
:input link: a link with the domain 'youtube.com' or 'youtu.be'
:returns: a list of dictionaries with the original link, the video id, or the channel id
'''

dict_ = {}
dict_['resolved_url'] = link


if 'v=' in link:
vid = link[link.index('v='):].strip('v=')
if len(vid) > 11:
vid = vid[:11]
channel = None
elif 'channel' in link:
channel = link[link.index('channel'):].strip('channel/')
if len(channel) > 24:
channel = channel[:24]
vid = None
elif 'youtu.be' in link:
vid = link.replace('https://', '').replace('http://', '').replace('youtu.be','').strip('/').strip('?a')
channel = None
else:
vid = None
channel = None

dict_['video_id'] = vid
dict_['channel'] = channel

return dict_

def strip_yt_link(link):
    '''
    Parses a link (or collection of links) to YouTube for the video id or
    channel id.

    :input link: a link, or an iterable of links, with the domain
        'youtube.com' or 'youtu.be'
    :returns: a single dictionary for a string input; otherwise a list of
        dictionaries (one per unique link) with the original link, the
        video id, or the channel id
    '''
    if isinstance(link, str):
        return _strip_yt_link(link)

    # any non-string input is treated as an iterable of links;
    # deduplicate before parsing
    unique_links = set(link)
    return [_strip_yt_link(one_link) for one_link in unique_links]

0 comments on commit 42d86cd

Please sign in to comment.