Commit

BUG->BIG
Hobson Lane committed Jan 4, 2019
1 parent 0ab4996 commit 7f14445
Showing 2 changed files with 24 additions and 23 deletions.
5 changes: 3 additions & 2 deletions src/nlpia/book/examples/ch06_word2vec_embedding_viz.py
@@ -3,13 +3,14 @@
import gensim
import numpy as np
import tensorflow as tf
from nlpia.loaders import get_data
from tensorflow.contrib.tensorboard.plugins import projector

words = ('Sacramento', 'California', 'Oregon', 'Salem', 'Washington', 'Olympia')

# loading your gensim
# model = gensim.models.KeyedVectors.load_word2vec_format('~/Downloads/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)
from nlpia.loaders import get_data

model = get_data('w2v', limit=200000) # <1>

# project part of vocab, 10K of 300 dimension
@@ -45,4 +46,4 @@
saver.save(sess, '/Users/hannes/Downloads/prefix_model.ckpt', global_step=1000)

# open tensorboard with logdir, check localhost:6006 for viewing your embedding.
# tensorboard --logdir="./projector/"
# tensorboard --logdir="./projector/"
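
A minimal usage sketch of the loading pattern this example relies on (not part of the commit; it assumes, as the commented-out gensim line above suggests, that get_data('w2v', limit=...) returns a gensim KeyedVectors over the 300-dimensional Google News word2vec vectors):

import numpy as np
from nlpia.loaders import get_data

words = ('Sacramento', 'California', 'Oregon', 'Salem', 'Washington', 'Olympia')
model = get_data('w2v', limit=200000)          # assumed to download, cache, and return KeyedVectors
vectors = np.array([model[w] for w in words])  # one 300-d row per word, ready for the projector
print(vectors.shape)                           # expected: (6, 300)
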
42 changes: 21 additions & 21 deletions src/nlpia/loaders.py
@@ -270,7 +270,7 @@ def load_anki_df(language='deu'):
'https://nlp.stanford.edu/data/glove.twitter.27B.zip',
1000000000, # FIXME: make sure size check is `>=`
),
'glove_sm': (
'glove_small': (
'https://nlp.stanford.edu/data/glove.6B.zip',
862182613,
os.path.join('glove.6B', 'glove.6B.50d.txt'),
@@ -355,10 +355,10 @@ def load_anki_df(language='deu'):
for yr in range(2011, 2017):
BIG_URLS['cdc' + str(yr)[-2:]] = ('https://www.cdc.gov/brfss/annual_data/{yr}/files/LLCP{yr}ASC.zip'.format(yr=yr), None)
BIG_URLS['word2vec'] = BIG_URLS['wv'] = BIG_URLS['w2v']
BIG_URLS['glove'] = BIG_URLS['glove_small'] = BIG_URLS['glove-small'] = BIG_URLS['glovesm'] = BIG_URLS['glove-sm'] = BIG_URLS['glove_sm']
BIG_URLS['glove'] = BIG_URLS['glovesm'] = BIG_URLS['glove-sm'] = BIG_URLS['glove_sm'] = BIG_URLS['glove-small'] = BIG_URLS['glove_small']
BIG_URLS['ubuntu'] = BIG_URLS['ubuntu_dialog'] = BIG_URLS['ubuntu_dialog_1500k']
BUG_URLS['glovelg'] = BUG_URLS['glove_lg'] = BUG_URLS['glove-lg'] = BUG_URLS['glove-large'] = BIG_URLS['glove_large']
BUG_URLS['glovemed'] = BUG_URLS['glove_med'] = BUG_URLS['glove-med'] = BUG_URLS['glove-medium'] = BIG_URLS['glove_medium']
BIG_URLS['glovelg'] = BIG_URLS['glove_lg'] = BIG_URLS['glove-lg'] = BIG_URLS['glove-large'] = BIG_URLS['glove_large']
BIG_URLS['glovemed'] = BIG_URLS['glove_med'] = BIG_URLS['glove-med'] = BIG_URLS['glove-medium'] = BIG_URLS['glove_medium']

ANKI_LANGUAGES = 'afr arq ara aze eus bel ben ber bul yue cat cbk cmn chv hrv ces dan nld est fin fra glg kat ' \
'deu ell heb hin hun isl ind ita jpn kha khm kor lvs lit nds mkd zsm mal mri mar max nob pes ' \
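
The alias hunk above is the heart of the "BUG->BIG" fix: the last two alias lines assigned into BUG_URLS, a dict name that is never defined, so importing loaders.py failed on that line. A toy sketch of the aliasing idiom itself, reusing the glove_small entry shown earlier in this diff (the real table holds many more datasets):

# Chained assignment binds every alias key to the same tuple object,
# so get_data() accepts several spellings of a dataset name.
BIG_URLS = {'glove_small': ('https://nlp.stanford.edu/data/glove.6B.zip', 862182613)}
BIG_URLS['glove'] = BIG_URLS['glovesm'] = BIG_URLS['glove-sm'] = BIG_URLS['glove_small']
assert BIG_URLS['glove'] is BIG_URLS['glove_small']

# The old typo assigned into BUG_URLS instead; item assignment on an
# undefined name raises immediately at import time, e.g.:
#     UNDEFINED_DICT['key'] = 'value'   # NameError: name 'UNDEFINED_DICT' is not defined
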
@@ -380,7 +380,7 @@ def load_anki_df(language='deu'):
GOOGLE_NGRAM_NAMES = '0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o other p pos punctuation q r s t u v w x y z'.split()
GOOGLE_NGRAM_FILE = 'googlebooks-eng-all-1gram-20120701-{}.gz'

for name in GOOGLE_NGRAM_NAMES:
for name in GOOGLE_NGRAM_NAMES:
BIG_URLS['1gram_{}'.format(name)] = (GOOGLE_NGRAM_URL + GOOGLE_NGRAM_FILE.format(name),
1000, GOOGLE_NGRAM_FILE.format(name),
pd.read_table,
@@ -423,7 +423,7 @@ def load_anki_df(language='deu'):


def rename_file(source, dest):
""" Rename (mv) file(s) from source to dest
""" Rename (mv) file(s) from source to dest
>>> from tempfile import mkdtemp
>>> tmpdir = mkdtemp(suffix='doctest_rename_file', prefix='tmp')
@@ -571,7 +571,7 @@ def looks_like_index(series, index_names=('Unnamed: 0', 'pk', 'index', '')):


def get_longest_table(url='https://www.openoffice.org/dev_docs/source/file_extensions.html', header=0):
""" Retrieve the HTML tables from a URL and return the longest DataFrame found
""" Retrieve the HTML tables from a URL and return the longest DataFrame found
>>> get_longest_table('https://en.wikipedia.org/wiki/List_of_sovereign_states').columns
Index(['Common and formal names', 'Membership within the UN System[a]',
@@ -584,7 +584,7 @@ def get_longest_table(url='https://www.openoffice.org/dev_docs/source/file_exten


def get_leet_map():
""" Retrieve mapping from English letters to l33t like E => 3 or A => /\ or /-\ or @ """
r""" Retrieve mapping from English letters to l33t like E => 3 or A => /\ or /-\ or @ """
df = get_longest_table(
'https://sites.google.com/site/inhainternetlanguage/different-internet-languages/l33t/list-of-l33ts', header=None)
df = df.drop(index=0).iloc[:, :2]
@@ -616,7 +616,7 @@ def get_netspeak_map():


def longest_table(dfs):
""" Return this single longest DataFrame that among an array/list/tuple of DataFrames
""" Return this single longest DataFrame that among an array/list/tuple of DataFrames
Useful for automagically finding the DataFrame you want when using pd.read_html() on a Wikipedia page.
"""
@@ -731,7 +731,7 @@ def ensure_open(f, mode='r'):


def wc(f, verbose=False, nrows=None):
r""" Count lines in a text file
r""" Count lines in a text file
References:
https://stackoverflow.com/q/845058/623735
@@ -801,12 +801,12 @@ def no_tqdm(it, total=1, **kwargs):


def expand_filepath(filepath):
""" Expand any '~', '.', '*' variables in filepath.
""" Expand any '~', '.', '*' variables in filepath.
See also: pugnlp.futil.expand_path
>>> len(expand_filepath('~')) > 3
True
True
"""
return os.path.abspath(os.path.expandvars(os.path.expanduser(filepath)))

@@ -863,7 +863,7 @@ def normalize_ext(filepath):


def normalize_filepath(filepath):
r""" Lowercase the filename and ext, expanding extensions like .tgz to .tar.gz.
r""" Lowercase the filename and ext, expanding extensions like .tgz to .tar.gz.
>>> normalize_filepath('/Hello_World.txt\n')
'hello_world.txt'
@@ -926,7 +926,7 @@ def normalize_glove(filepath):


def unzip(filepath, verbose=True):
r""" Unzip GloVE models and convert to word2vec binary models (gensim.KeyedVectors)
r""" Unzip GloVE models and convert to word2vec binary models (gensim.KeyedVectors)
The only kinds of files that are returned are "*.asc" and "*.txt" and only after renaming.
"""
@@ -963,7 +963,7 @@ def unzip(filepath, verbose=True):
repr(glove_input_file), repr(word2vec_output_file)))
try:
glove2word2vec(glove_input_file=glove_input_file, word2vec_output_file=word2vec_output_file)
except:
except: # noqa
logger.info('Failed to convert GloVE format to Word2vec: {} -> {}'.format(
repr(glove_input_file), repr(word2vec_output_file)))

@@ -1025,8 +1025,8 @@ def get_ftp_filemeta(parsed_url, username='anonymous', password='nlpia@totalgood
username=(parsed_url.username or username),
remote_size=-1,
filename=os.path.basename(parsed_url.path))
ftp = ftplib.FTP(parsed_url.hostname)
ftp.login(username, password)
ftp = ftplib.FTP(parsed_url.hostname)
ftp.login(username, password)
ftp.cwd(parsed_url.path)
ftp.retrbinary("RETR " + filename, open(filename, 'wb').write)
ftp.quit()
@@ -1348,19 +1348,19 @@ def get_data(name='sms-spam', nrows=None, limit=None):
if filepathlow.endswith('.gz'):
try:
filepath = ensure_open(filepath)
except:
except: # noqa
pass
if re.match(r'.json([.][a-z]{0,3}){0,2}', filepathlow):
return read_json(filepath)
if filepathlow.endswith('.tsv.gz') or filepathlow.endswith('.tsv'):
try:
return pd.read_table(filepath)
except:
except: # noqa
pass
if filepathlow.endswith('.csv.gz') or filepathlow.endswith('.csv'):
try:
return read_csv(filepath)
except:
except: # noqa
pass
if filepathlow.endswith('.txt'):
try:
@@ -1488,7 +1488,7 @@ def clean_column_values(df, inplace=True):
values = values.astype(float)
except ValueError:
values = None
except:
except: # noqa
logger.error('Error on column {} with dtype {}'.format(c, df[c].dtype))
raise

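The repeated change from a bare "except:" to "except:  # noqa" in the hunks above keeps the deliberately broad error handling in loaders.py while quieting the linter; flake8 flags bare excepts as E722, and a trailing # noqa comment suppresses warnings for that line. A minimal sketch of the pattern with a hypothetical helper name (try_read_table is not in loaders.py):

import pandas as pd

def try_read_table(filepath):
    """Return a DataFrame if pandas can parse the file, else None."""
    try:
        return pd.read_table(filepath)
    except:  # noqa
        return None  # swallow any read/parse error so the caller can try another loader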