diff --git a/conda/environment.yml b/conda/environment.yml index ca24d09..7db7bbd 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -33,9 +33,9 @@ dependencies: - scipy==1.1.0 - spacy==2.0.18 - swig>=3.0.12 - - tensorflow>=1.13.1 + - tensorflow-gpu>=1.13.1 - tensorflow-hub>=0.4.0 - - theano==1.0.2 + - theano>=1.0.2 # this will fail on windows: # - python-annoy>=1.9.5,<2.0.0 diff --git a/src/nlpia/book/examples/ch10_translate.py b/src/nlpia/book/examples/ch10_translate.py index aff1f0e..1f2cec0 100644 --- a/src/nlpia/book/examples/ch10_translate.py +++ b/src/nlpia/book/examples/ch10_translate.py @@ -102,6 +102,10 @@ from keras.callbacks import ModelCheckpoint # noqa from nlpia.constants import BIGDATA_PATH # noqa checkpoint_path = os.path.join(BIGDATA_PATH, 'checkpoints') +try: + os.mkdir(checkpoint_path) +except FileExistsError: + pass checkpoint_path = os.path.join(checkpoint_path, 'nlpia-seq2seq-translation-weights.{epoch:02d}-{val_loss:.2f}.hdf5') diff --git a/src/nlpia/futil.py b/src/nlpia/futil.py index d56bb9f..fa0cbf0 100644 --- a/src/nlpia/futil.py +++ b/src/nlpia/futil.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -""" File utilities comparable to similarly named bash utils: rm_rf(), rm_f(), and mkdir_p() +""" File utilities comparable to similarly named bash utils: rm_rf(), rm_f(), and mkdir_p() dataset1.0 is in files like: PPE1.rar PPE2.zip PPE3.zip PP4.7zip dataset2.0 is in gs:/Buckets/safety_monitoring/data/obj/supplemental/""" @@ -123,7 +123,7 @@ def rm_r(path, force=False): elif os.path.isdir(path): try: return os.rmdir(path) - except OSError: # OSError: [Errno 66] Directory not empty: + except OSError: # OSError: [Errno 66] Directory not empty: pass except: if not force: @@ -223,7 +223,7 @@ def ensure_open(f, mode='r'): return open(f, mode=mode) f = fin # reset path in case it is the text that needs to be opened with StringIO else: - f = io.StringIO(f) + f = io.StringIO(f) elif f and getattr(f, 'closed', None): if hasattr(f, '_write_gzip_header'): return gzip.open(f.name, mode=mode) @@ -341,11 +341,11 @@ def update_dict_types(d, update_keys=True, update_values=True, typ=(int,)): di[ki] = vi d.update(di) return d - + def read_json(filepath, intkeys=True, intvalues=True): """ read text from filepath (`open(find_filepath(expand_filepath(fp)))`) then json.loads() - + >>> read_json('HTTP_1.1 Status Code Definitions.html.json') {'100': 'Continue', '101': 'Switching Protocols',... @@ -432,6 +432,3 @@ def read_text(forfn, nrows=None, verbose=True): ) / float(len(lines)) > .05: return np.array(html2text(EOL.join(lines)).split(EOL)) return lines - - -read_txt = read_text \ No newline at end of file diff --git a/src/nlpia/loaders.py b/src/nlpia/loaders.py index ce338c4..bc70c1c 100644 --- a/src/nlpia/loaders.py +++ b/src/nlpia/loaders.py @@ -312,7 +312,7 @@ def load_anki_df(language='deu'): 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip', 9916637, 'cornell_movie_dialogs_corpus', - + ), 'save_dialog_tweets': ( 'https://www.dropbox.com/s/tlrr9bm45uzm9yl/save_dialog_tweets.txt.gz?dl=1', @@ -981,7 +981,7 @@ def read_named_csv(name, data_path=DATA_PATH, nrows=None, verbose=True): except (IOError, pd.errors.ParserError): pass try: - return read_txt(name, nrows=nrows) + return read_text(name, nrows=nrows) except (IOError, UnicodeDecodeError): pass data_path = expand_filepath(data_path) @@ -1006,7 +1006,7 @@ def read_named_csv(name, data_path=DATA_PATH, nrows=None, verbose=True): except IOError: pass try: - return read_txt(os.path.join(data_path, name + '.txt'), verbose=verbose) + return read_text(os.path.join(data_path, name + '.txt'), verbose=verbose) except IOError: pass @@ -1019,7 +1019,7 @@ def read_named_csv(name, data_path=DATA_PATH, nrows=None, verbose=True): except ValueError: pass try: - return read_txt(os.path.join(BIGDATA_PATH, name + '.txt'), verbose=verbose) + return read_text(os.path.join(BIGDATA_PATH, name + '.txt'), verbose=verbose) except IOError: pass @@ -1093,7 +1093,7 @@ def get_data(name='sms-spam', nrows=None, limit=None): pass if filepathlow.endswith('.txt'): try: - return read_txt(filepath) + return read_text(filepath) except (TypeError, UnicodeError): pass return filepaths[name] @@ -1290,7 +1290,7 @@ def cleaner(row): def clean_cornell_movies(filename='cornell_movie_dialogs_corpus.zip', subdir='cornell movie-dialogs corpus'): """ Load a dataframe of ~100k raw (uncollated) movie lines from the cornell movies dialog corpus - + >>> local_filepath = download_file(BIG_URLS['cornell_movie_dialogs_corpus'][0]) >>> df = clean_cornell_movies(filename='cornell_movie_dialogs_corpus.zip') >>> df.describe(include='all')