Skip to content

Commit

Permalink
link functions
Browse files Browse the repository at this point in the history
  • Loading branch information
mabrownnyu committed Aug 10, 2018
1 parent d301604 commit 42d86cd
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 4 deletions.
7 changes: 5 additions & 2 deletions urlexpander/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,15 @@ def _expand(link, timeout=2, **kwargs):
except requests.exceptions.RequestException as e:
domain, url_long = _parse_error(str(e))

if domain in constants.url_appenders:
# replace list with constants.url_appenders
if domain in ['ln.is', 'linkis.com']:
url_long = link.replace(domain, '')
domain = get_domain(url_long)

elif domain in constants.short_domain_ad_redirects or domain == -1:
url_long = unshortenit.UnshortenIt().unshorten(link,
timeout=timeout)
domain = get_domain(url_long)


return dict(original_url=link,
resolved_domain=domain,
Expand Down Expand Up @@ -205,6 +206,7 @@ def expand(links_to_unshorten, chunksize=1280, n_workers=1,

# chunk the list of arguments
if verbose:
print("There are {} links to unshorten".format(len(links_to_unshorten)))
chunk_iter = tqdm(_chunks(links_to_unshorten, chunksize=chunksize))
else:
chunk_iter = _chunks(links_to_unshorten, chunksize=chunksize)
Expand All @@ -223,6 +225,7 @@ def expand(links_to_unshorten, chunksize=1280, n_workers=1,
if verbose:
print("{} failed to resolve due to error: {}".format(chunk[i],
str(type(exc))))

finally:
if isinstance(data, dict):
unshortened_urls.append(data)
Expand Down
3 changes: 2 additions & 1 deletion urlexpander/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@
'on.wsj.com',
]

# these are link shorteners that have the actual link appended to the end
url_appenders = ['ln.is', 'linkis.com']

all_short_domains = short_domain_ad_redirects + short_domain + ['ln.is']
all_short_domains = short_domain_ad_redirects + short_domain + url_appenders

congress_dataset_url = ('https://raw.githubusercontent.com/SMAPPNYU/'
'urlExpander/master/datasets/'
Expand Down
46 changes: 45 additions & 1 deletion urlexpander/core/tweet_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from urlexpander.core.api import get_domain

__all__ = ['get_link', 'count_matrix']
__all__ = ['get_link', 'count_matrix', 'strip_tweet_link']
__author__= 'Leon Yin'

def _get_full_text(tweet):
Expand Down Expand Up @@ -117,3 +117,47 @@ def count_matrix(df, user_col='user_id', domain_col='link_domain',

return matrix

def _strip_tweet_link(link):
'''
Best attempt at stripping twitter links for screen names and tweet ids.
:input link: a link with the domain 'twitter.com'
:returns: a list of dictionaries with the original link, the screen name, and the tweet_id of the link
'''
dict_ = {}

dict_['resolved_url'] = link.lower()
if 'status' in link.split('/'):
list_ = link.lower().split('/')
try:
if 'i/web/status' in link: dict_['linked_screen_name'] == None
else: dict_['linked_screen_name'] = list_[list_.index('status') - 1]
dict_['linked_tweet_id'] = list_[list_.index('status') + 1]
except:
dict_['linked_screen_name'] = None
dict_['linked_tweet_id'] = None

else:
dict_['linked_screen_name'] = link
dict_['linked_tweet_id'] = link

return dict_

def strip_tweet_link(link):
    '''
    Parses a link (or collection of links) to Twitter for the screen name
    and tweet id.

    :input link: a link, or an iterable of links, with the domain 'twitter.com'
    :returns: a single dictionary for a string input; otherwise a list of
        dictionaries (one per unique link) with the original link, the
        screen name, and the tweet id
    '''
    if isinstance(link, str):
        return _strip_tweet_link(link)

    # any non-string input is treated as an iterable of links;
    # deduplicate before parsing
    unique_links = set(link)
    return [_strip_tweet_link(one_link) for one_link in unique_links]
56 changes: 56 additions & 0 deletions urlexpander/core/youtube_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
__all__ = ['strip_yt_link']

def _strip_yt_link(link):
'''
Parses a link to youtube to get the screen name and tweet id
:input link: a link with the domain 'youtube.com' or 'youtu.be'
:returns: a list of dictionaries with the original link, the video id, or the channel id
'''

dict_ = {}
dict_['resolved_url'] = link


if 'v=' in link:
vid = link[link.index('v='):].strip('v=')
if len(vid) > 11:
vid = vid[:11]
channel = None
elif 'channel' in link:
channel = link[link.index('channel'):].strip('channel/')
if len(channel) > 24:
channel = channel[:24]
vid = None
elif 'youtu.be' in link:
vid = link.replace('https://', '').replace('http://', '').replace('youtu.be','').strip('/').strip('?a')
channel = None
else:
vid = None
channel = None

dict_['video_id'] = vid
dict_['channel'] = channel

return dict_

def strip_yt_link(link):
    '''
    Parses a link (or collection of links) to YouTube for the video id or
    channel id.

    :input link: a link, or an iterable of links, with the domain
        'youtube.com' or 'youtu.be'
    :returns: a single dictionary for a string input; otherwise a list of
        dictionaries (one per unique link) with the original link, the
        video id, or the channel id
    '''
    if isinstance(link, str):
        return _strip_yt_link(link)

    # any non-string input is treated as an iterable of links;
    # deduplicate before parsing
    unique_links = set(link)
    return [_strip_yt_link(one_link) for one_link in unique_links]

0 comments on commit 42d86cd

Please sign in to comment.