
Commit

add tensorflow-gpu to conda env, fix ch10_translate and unify futil.read_text read_txt

hobs committed May 3, 2019
1 parent efa0112 commit 4d4f3bd
Showing 4 changed files with 17 additions and 16 deletions.
conda/environment.yml (4 changes: 2 additions & 2 deletions)
@@ -33,9 +33,9 @@ dependencies:
   - scipy==1.1.0
   - spacy==2.0.18
   - swig>=3.0.12
-  - tensorflow>=1.13.1
+  - tensorflow-gpu>=1.13.1
   - tensorflow-hub>=0.4.0
-  - theano==1.0.2
+  - theano>=1.0.2
   # this will fail on windows:
   # - python-annoy>=1.9.5,<2.0.0

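Since the environment now pulls in the GPU build of TensorFlow, a quick sanity check after conda env create is the TensorFlow 1.x test helper (a sketch, not part of this commit):

import tensorflow as tf

# True only when the tensorflow-gpu build can see a usable CUDA device;
# the CPU-only tensorflow package always returns False here.
print(tf.test.is_gpu_available())
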
src/nlpia/book/examples/ch10_translate.py (4 changes: 4 additions & 0 deletions)
@@ -102,6 +102,10 @@
 from keras.callbacks import ModelCheckpoint  # noqa
 from nlpia.constants import BIGDATA_PATH  # noqa
 checkpoint_path = os.path.join(BIGDATA_PATH, 'checkpoints')
+try:
+    os.mkdir(checkpoint_path)
+except FileExistsError:
+    pass
 checkpoint_path = os.path.join(checkpoint_path, 'nlpia-seq2seq-translation-weights.{epoch:02d}-{val_loss:.2f}.hdf5')


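The new try/except makes the checkpoint-directory creation idempotent, so re-running the example no longer crashes when the directory already exists; on Python 3, os.makedirs(path, exist_ok=True) is an equivalent one-liner. A minimal sketch of how the formatted checkpoint_path is typically handed to the ModelCheckpoint callback imported above (the monitor and save_best_only settings are assumptions, not part of this commit):

import os
from keras.callbacks import ModelCheckpoint
from nlpia.constants import BIGDATA_PATH

checkpoint_dir = os.path.join(BIGDATA_PATH, 'checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)  # no-op if the directory already exists

# Keras fills in {epoch} and {val_loss} each time a checkpoint is saved.
checkpoint_path = os.path.join(
    checkpoint_dir, 'nlpia-seq2seq-translation-weights.{epoch:02d}-{val_loss:.2f}.hdf5')
checkpoint_callback = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)
# model.fit(..., callbacks=[checkpoint_callback])  # assumes a compiled seq2seq model
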
src/nlpia/futil.py (13 changes: 5 additions & 8 deletions)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-""" File utilities comparable to similarly named bash utils: rm_rf(), rm_f(), and mkdir_p()
+""" File utilities comparable to similarly named bash utils: rm_rf(), rm_f(), and mkdir_p()
 dataset1.0 is in files like: PPE1.rar PPE2.zip PPE3.zip PP4.7zip
 dataset2.0 is in gs:/Buckets/safety_monitoring/data/obj/supplemental/"""
@@ -123,7 +123,7 @@ def rm_r(path, force=False):
     elif os.path.isdir(path):
         try:
             return os.rmdir(path)
-        except OSError: # OSError: [Errno 66] Directory not empty:
+        except OSError:  # OSError: [Errno 66] Directory not empty:
             pass
         except:
             if not force:
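
The only change in this hunk is inline-comment spacing (PEP8 asks for two spaces before a trailing comment). For context, rm_r mirrors bash rm -r: os.rmdir() handles an empty directory, the "directory not empty" OSError is swallowed, and the bare except consults the force flag before deciding whether to re-raise (judging from the lines shown). A hypothetical call (the path is made up):

from nlpia.futil import rm_r

rm_r('/tmp/scratch_dir')              # removes the directory if it is empty
rm_r('/tmp/scratch_dir', force=True)  # suppresses errors instead of re-raising
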
@@ -223,7 +223,7 @@ def ensure_open(f, mode='r'):
                 return open(f, mode=mode)
             f = fin  # reset path in case it is the text that needs to be opened with StringIO
         else:
-            f = io.StringIO(f)
+            f = io.StringIO(f)
     elif f and getattr(f, 'closed', None):
         if hasattr(f, '_write_gzip_header'):
             return gzip.open(f.name, mode=mode)
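
For context, ensure_open normalizes whatever it is given: an existing file path is opened with open(), a closed gzip file pointer is reopened with gzip.open(), and a string that is not a path falls through to io.StringIO. A usage sketch based on the branches shown above (the file name is hypothetical):

from nlpia.futil import ensure_open

with ensure_open('corpus.txt') as f:  # an existing path is opened normally
    text = f.read()

f = ensure_open('raw text rather than a path')  # wrapped in io.StringIO
print(f.read())
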
@@ -341,11 +341,11 @@ def update_dict_types(d, update_keys=True, update_values=True, typ=(int,)):
         di[ki] = vi
     d.update(di)
     return d
-
+
 
 def read_json(filepath, intkeys=True, intvalues=True):
     """ read text from filepath (`open(find_filepath(expand_filepath(fp)))`) then json.loads()
     >>> read_json('HTTP_1.1 Status Code Definitions.html.json')
     {'100': 'Continue',
      '101': 'Switching Protocols',...
@@ -432,6 +432,3 @@ def read_text(forfn, nrows=None, verbose=True):
             ) / float(len(lines)) > .05:
         return np.array(html2text(EOL.join(lines)).split(EOL))
     return lines
-
-
-read_txt = read_text
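
With the read_txt alias deleted here (and its call sites updated in loaders.py below), read_text is the single entry point. A usage sketch matching the signature above (the file name is hypothetical):

from nlpia.futil import read_text

# First 10 lines of a local text file; mostly-HTML input is run through html2text first.
lines = read_text('example.txt', nrows=10)
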
src/nlpia/loaders.py (12 changes: 6 additions & 6 deletions)
@@ -312,7 +312,7 @@ def load_anki_df(language='deu'):
        'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip',
        9916637,
        'cornell_movie_dialogs_corpus',
-
+
    ),
    'save_dialog_tweets': (
        'https://www.dropbox.com/s/tlrr9bm45uzm9yl/save_dialog_tweets.txt.gz?dl=1',
@@ -981,7 +981,7 @@ def read_named_csv(name, data_path=DATA_PATH, nrows=None, verbose=True):
     except (IOError, pd.errors.ParserError):
         pass
     try:
-        return read_txt(name, nrows=nrows)
+        return read_text(name, nrows=nrows)
     except (IOError, UnicodeDecodeError):
         pass
     data_path = expand_filepath(data_path)
@@ -1006,7 +1006,7 @@ def read_named_csv(name, data_path=DATA_PATH, nrows=None, verbose=True):
     except IOError:
         pass
     try:
-        return read_txt(os.path.join(data_path, name + '.txt'), verbose=verbose)
+        return read_text(os.path.join(data_path, name + '.txt'), verbose=verbose)
     except IOError:
         pass

@@ -1019,7 +1019,7 @@ def read_named_csv(name, data_path=DATA_PATH, nrows=None, verbose=True):
     except ValueError:
         pass
     try:
-        return read_txt(os.path.join(BIGDATA_PATH, name + '.txt'), verbose=verbose)
+        return read_text(os.path.join(BIGDATA_PATH, name + '.txt'), verbose=verbose)
     except IOError:
         pass

@@ -1093,7 +1093,7 @@ def get_data(name='sms-spam', nrows=None, limit=None):
         pass
     if filepathlow.endswith('.txt'):
         try:
-            return read_txt(filepath)
+            return read_text(filepath)
         except (TypeError, UnicodeError):
             pass
     return filepaths[name]
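
get_data is the public loader sitting above the readers patched here, so the rename from read_txt to read_text is invisible to callers. A usage sketch (the 'sms-spam' key is this function's own default):

from nlpia.loaders import get_data

df = get_data('sms-spam')  # resolve the name to a local or downloadable dataset, then load it
print(df.head())
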
@@ -1290,7 +1290,7 @@ def cleaner(row):
 
 def clean_cornell_movies(filename='cornell_movie_dialogs_corpus.zip', subdir='cornell movie-dialogs corpus'):
     """ Load a dataframe of ~100k raw (uncollated) movie lines from the cornell movies dialog corpus
-
+
     >>> local_filepath = download_file(BIG_URLS['cornell_movie_dialogs_corpus'][0])
     >>> df = clean_cornell_movies(filename='cornell_movie_dialogs_corpus.zip')
     >>> df.describe(include='all')
