diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..756b5d9c
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+* text=auto
diff --git a/.travis.yml b/.travis.yml
index 455b55e9..d7bf1a7e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,15 +20,15 @@ before_install:
   - sudo mkdir /scratch/
   - sudo chmod 777 /scratch/
   - travis/miniconda.sh -b
+  - mv travis/.condarc $HOME
   - export PATH=/home/travis/anaconda/bin:$PATH
   - conda update --yes conda
 install:
-  - conda install --yes pip python=$TRAVIS_PYTHON_VERSION atlas numpy scipy beautiful-soup six scikit-learn
-  - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then pip install --use-mirrors configparser; fi
-  - pip install -r requirements.txt --use-mirrors
-  - pip install python-coveralls --use-mirrors
-  - pip install nose-cov --use-mirrors
-  - if [ $GRIDMAP == "true" ]; then pip install --use-mirrors git+git://github.com/dan-blanchard/drmaa-python gridmap; fi
+  - conda install --yes pip python=$TRAVIS_PYTHON_VERSION atlas numpy scipy beautiful-soup six scikit-learn joblib prettytable python-coveralls
+  - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes configparser futures logutils; fi
+  - if [ $GRIDMAP == "true" ]; then conda install --yes drmaa gridmap; fi
+  # Have to use pip for nose-cov because its entry points are not supported by conda yet
+  - pip install --use-mirrors nose-cov
   - sudo rm -rf /dev/shm
   - sudo ln -s /run/shm /dev/shm
   - python setup.py install
diff --git a/requirements.txt b/requirements.txt
index cbcb836e..967186eb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ PrettyTable
 beautifulsoup4
 numpy
 scipy
+joblib
diff --git a/setup.py b/setup.py
index 7a43a6f9..baccef44 100644
--- a/setup.py
+++ b/setup.py
@@ -35,11 +35,15 @@ def requirements():
       author='Daniel Blanchard',
       author_email='dblanchard@ets.org',
       license='BSD 3 clause',
-      packages=['skll'],
-      scripts=['scripts/filter_megam', 'scripts/generate_predictions',
-               'scripts/join_megam', 'scripts/megam_to_libsvm',
-               'scripts/print_model_weights', 'scripts/run_experiment',
-               'scripts/skll_convert', 'scripts/summarize_results'],
+      packages=['skll', 'skll.utilities'],
+      entry_points={'console_scripts': ['filter_megam = skll.utilities.filter_megam:main',
+                                        'generate_predictions = skll.utilities.generate_predictions:main',
+                                        'join_megam = skll.utilities.join_megam:main',
+                                        'megam_to_libsvm = skll.utilities.megam_to_libsvm:main',
+                                        'print_model_weights = skll.utilities.print_model_weights:main',
+                                        'run_experiment = skll.utilities.run_experiment:main',
+                                        'skll_convert = skll.utilities.skll_convert:main',
+                                        'summarize_results = skll.utilities.summarize_results:main']},
       install_requires=requirements(),
       classifiers=['Intended Audience :: Science/Research',
                    'Intended Audience :: Developers',
diff --git a/skll/experiments.py b/skll/experiments.py
index fa76e83f..3553591d 100644
--- a/skll/experiments.py
+++ b/skll/experiments.py
@@ -587,11 +587,32 @@ def _classify_featureset(args):
                   featureset),
                   file=log_file)
 
+        # check whether a trained model on the same data with the same
+        # featureset already exists; if so, load it and then use it on test data
+        modelfile = os.path.join(model_path, '{}.model'.format(job_name))
+
         # load the training and test examples
-        train_examples = _load_featureset(train_path, featureset, suffix,
-                                          label_col=label_col,
-                                          ids_to_floats=ids_to_floats,
-                                          quiet=quiet, class_map=class_map)
+        if task == 'cross_validate' or (not os.path.exists(modelfile) or
+                                        overwrite):
+            train_examples = _load_featureset(train_path, featureset, suffix,
+                                              label_col=label_col,
+                                              ids_to_floats=ids_to_floats,
+                                              quiet=quiet, class_map=class_map)
+            # initialize a classifier object
+            learner = Learner(learner_name,
+                              probability=probability,
+                              feature_scaling=feature_scaling,
+                              model_kwargs=fixed_parameters,
+                              pos_label_str=pos_label_str,
+                              min_feature_count=min_feature_count)
+        # load the model if it already exists
+        else:
+            if os.path.exists(modelfile) and not overwrite:
+                print(('\tloading pre-existing {} ' +
+                       'model: {}').format(learner_name, modelfile))
+                learner = Learner.from_file(modelfile)
+
+        # Load the test set if there is one
         if task == 'evaluate' or task == 'predict':
             test_examples = _load_featureset(test_path, featureset, suffix,
                                              label_col=label_col,
@@ -599,18 +620,6 @@
                                              quiet=quiet, class_map=class_map,
                                              unlabelled=True)
 
-        # initialize a classifer object
-        learner = Learner(learner_name,
-                          probability=probability,
-                          feature_scaling=feature_scaling,
-                          model_kwargs=fixed_parameters,
-                          pos_label_str=pos_label_str,
-                          min_feature_count=min_feature_count)
-
-        # check whether a trained model on the same data with the same
-        # featureset already exists if so, load it (and the feature
-        # vocabulary) and then use it on the test data
-        modelfile = os.path.join(model_path, '{}.model'.format(job_name))
 
         # create a list of dictionaries of the results information
         learner_result_dict_base = {'experiment_name': experiment_name,
@@ -639,14 +648,8 @@
                                             param_grid=param_grid,
                                             grid_jobs=grid_search_jobs)
             else:
-                # load the model if it already exists
-                if os.path.exists(modelfile) and not overwrite:
-                    print(('\tloading pre-existing {} ' +
-                           'model: {}').format(learner_name, modelfile))
-                    learner.load(modelfile)
-
                 # if we do not have a saved model, we need to train one.
-                else:
+                if not os.path.exists(modelfile) or overwrite:
                     print(('\tfeaturizing and training new ' +
                            '{} model').format(learner_name),
                           file=log_file)
diff --git a/skll/learner.py b/skll/learner.py
index 2fb8de81..8524a6d1 100644
--- a/skll/learner.py
+++ b/skll/learner.py
@@ -18,6 +18,7 @@
 from functools import wraps
 from multiprocessing import cpu_count
 
+import joblib
 import numpy as np
 import scipy.sparse as sp
 from six import iteritems, itervalues
@@ -443,23 +444,22 @@ def from_file(cls, learner_path):
         '''
         :returns: New instance of Learner from the pickle at the specified path.
         '''
-        with open(learner_path, "rb") as f:
-            skll_version, learner = pickle.load(f)
-            # Check that we've actually loaded a Learner (or sub-class)
-            if not isinstance(learner, cls):
-                raise ValueError(('The pickle stored at {} does not contain ' +
-                                  'a {} object.').format(learner_path, cls))
-            # Check that versions are compatible. (Currently, this just checks
-            # that major versions match)
-            elif skll_version[0] == VERSION[0]:
-                return learner
-            else:
-                raise Exception(("{} stored in pickle file {} was " +
-                                 "created with version {} of SKLL, which is " +
-                                 "incompatible with the current version " +
-                                 "{}").format(cls, learner_path,
-                                              '.'.join(skll_version),
-                                              '.'.join(VERSION)))
+        skll_version, learner = joblib.load(learner_path)
+        # Check that we've actually loaded a Learner (or sub-class)
+        if not isinstance(learner, cls):
+            raise ValueError(('The pickle stored at {} does not contain ' +
+                              'a {} object.').format(learner_path, cls))
+        # Check that versions are compatible. (Currently, this just checks
+        # that major versions match)
+        elif skll_version[0] == VERSION[0]:
+            return learner
+        else:
+            raise ValueError(("{} stored in pickle file {} was " +
+                              "created with version {} of SKLL, which is " +
+                              "incompatible with the current version " +
+                              "{}").format(cls, learner_path,
+                                           '.'.join(map(str, skll_version)),
+                                           '.'.join(map(str, VERSION))))
 
     @property
     def model_type(self):
@@ -548,8 +548,7 @@ def save(self, learner_path):
         if not os.path.exists(learner_dir):
             os.makedirs(learner_dir)
         # write out the files
-        with open(learner_path, "wb") as f:
-            pickle.dump((VERSION, self), f, -1)
+        joblib.dump((VERSION, self), learner_path)
 
     def _create_estimator(self):
         '''
diff --git a/skll/utilities/__init__.py b/skll/utilities/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/filter_megam b/skll/utilities/filter_megam.py
similarity index 97%
rename from scripts/filter_megam
rename to skll/utilities/filter_megam.py
index f8fcf3f8..a6a6fd5b 100755
--- a/scripts/filter_megam
+++ b/skll/utilities/filter_megam.py
@@ -37,7 +37,10 @@
 from skll.version import __version__
 
 
-if __name__ == '__main__':
+def main():
+    '''
+    Handles command line arguments and gets things started.
+    '''
     # Get command line arguments
     parser = argparse.ArgumentParser(description="Filter MegaM file to remove\
                                                   features with names in stop\
@@ -98,3 +101,7 @@
                 print(" ", end='')
             print('{} {}'.format(feature, value), end="")
         print()
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/generate_predictions b/skll/utilities/generate_predictions.py
similarity index 100%
rename from scripts/generate_predictions
rename to skll/utilities/generate_predictions.py
diff --git a/scripts/join_megam b/skll/utilities/join_megam.py
similarity index 98%
rename from scripts/join_megam
rename to skll/utilities/join_megam.py
index 4897990f..534c638f 100755
--- a/scripts/join_megam
+++ b/skll/utilities/join_megam.py
@@ -96,7 +96,10 @@ def get_unique_name(feature_name, prev_feature_set, filename):
     return new_feature_name
 
 
-if __name__ == '__main__':
+def main():
+    '''
+    Handles command line arguments and gets things started.
+    '''
     # Get command line arguments
     parser = argparse.ArgumentParser(description="Combine MegaM files that \
                                                   contain features for the same\
@@ -213,3 +216,7 @@ def get_unique_name(feature_name, prev_feature_set, filename):
             print("# {}".format(curr_filename).encode('utf-8'))
             print("{}\t{}".format(class_dict[curr_filename],
                                   feature_dict[curr_filename].strip()).encode('utf-8'))
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/megam_to_libsvm b/skll/utilities/megam_to_libsvm.py
similarity index 100%
rename from scripts/megam_to_libsvm
rename to skll/utilities/megam_to_libsvm.py
diff --git a/scripts/print_model_weights b/skll/utilities/print_model_weights.py
similarity index 95%
rename from scripts/print_model_weights
rename to skll/utilities/print_model_weights.py
index 19186215..60450e21 100755
--- a/scripts/print_model_weights
+++ b/skll/utilities/print_model_weights.py
@@ -36,7 +36,10 @@
 from skll.version import __version__
 
 
-if __name__ == '__main__':
+def main():
+    '''
+    Handles command line arguments and gets things started.
+    '''
     parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                   given model.",
                                      conflict_handler='resolve',
@@ -63,3 +66,7 @@
 
     for feat, val in sorted(iteritems(weights), key=lambda x: -abs(x[1]))[:k]:
         print("{:.12f}\t{}".format(val, feat))
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/run_experiment b/skll/utilities/run_experiment.py
similarity index 100%
rename from scripts/run_experiment
rename to skll/utilities/run_experiment.py
diff --git a/scripts/skll_convert b/skll/utilities/skll_convert.py
similarity index 97%
rename from scripts/skll_convert
rename to skll/utilities/skll_convert.py
index 775732fe..f89ef3c0 100755
--- a/scripts/skll_convert
+++ b/skll/utilities/skll_convert.py
@@ -37,7 +37,10 @@
 from skll.version import __version__
 
 
-if __name__ == '__main__':
+def main():
+    '''
+    Handles command line arguments and gets things started.
+    '''
     # Get command line arguments
     parser = argparse.ArgumentParser(description="Takes an input feature file \
                                                   and converts it to another \
@@ -111,3 +114,6 @@
     write_feature_file(args.outfile, ids, classes, feature_dicts,
                        arff_regression=args.arff_regression,
                        arff_relation=args.arff_relation)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/summarize_results b/skll/utilities/summarize_results.py
similarity index 100%
rename from scripts/summarize_results
rename to skll/utilities/summarize_results.py
diff --git a/skll/version.py b/skll/version.py
index ba3c42f4..6dfbc3c0 100644
--- a/skll/version.py
+++ b/skll/version.py
@@ -7,5 +7,5 @@
 :organization: ETS
 '''
 
-__version__ = '0.22.0'
+__version__ = '0.22.1'
 VERSION = tuple(int(x) for x in __version__.split('.'))
diff --git a/tests/test_skll.py b/tests/test_skll.py
index 8300c765..7aacc24c 100644
--- a/tests/test_skll.py
+++ b/tests/test_skll.py
@@ -912,9 +912,6 @@ def check_convert_featureset(from_suffix, to_suffix):
     # the path to the unmerged feature files
     dirpath = os.path.join(_my_dir, 'train', 'test_conversion')
 
-    # get the path to the conversion script
-    converter_path = os.path.abspath(os.path.join(_my_dir, '..', 'scripts', 'skll_convert'))
-
    # get the feature name prefix
     feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                             to_suffix.lstrip('.'))
@@ -925,9 +922,8 @@ def check_convert_featureset(from_suffix, to_suffix):
                                        feature, from_suffix))
         output_file_path = os.path.join(dirpath, '{}_{}{}'.format(feature_name_prefix,
                                                                   feature, to_suffix))
-        convert_cmd = shlex.split('{} --quiet {} {}'.format(converter_path,
-                                                            input_file_path,
-                                                            output_file_path))
+        convert_cmd = shlex.split('skll_convert --quiet {} {}'.format(input_file_path,
+                                                                      output_file_path))
         subprocess.check_call(convert_cmd)
 
     # now load and merge all unmerged, converted features in the `to_suffix` format
diff --git a/travis/.condarc b/travis/.condarc
new file mode 100644
index 00000000..18b61bfd
--- /dev/null
+++ b/travis/.condarc
@@ -0,0 +1,5 @@
+# a condarc file should be placed in $HOME/.condarc
+
+channels:
+  - https://conda.binstar.org/dan_blanchard
+  - defaults
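
Note for reviewers: the snippet below is a minimal, self-contained sketch of the joblib-based persistence that the skll/learner.py hunks above switch to, handy for sanity-checking the scheme locally. The standalone save_learner/load_learner helpers are illustrative stand-ins, not SKLL API; only the (VERSION, learner) tuple layout and the major-version check mirror the actual change.

    # persistence_sketch.py -- illustrative only, not SKLL code
    import joblib

    VERSION = (0, 22, 1)  # matches skll/version.py after this patch

    def save_learner(learner, path):
        # Learner.save() now delegates to joblib.dump, which serializes
        # objects containing large numpy arrays far more efficiently
        # than a plain pickle.dump.
        joblib.dump((VERSION, learner), path)

    def load_learner(path, expected_cls):
        # Mirrors Learner.from_file(): unpack the (version, learner)
        # tuple, check the type, and require matching major versions.
        skll_version, learner = joblib.load(path)
        if not isinstance(learner, expected_cls):
            raise ValueError('The file stored at {} does not contain '
                             'a {} object.'.format(path, expected_cls))
        if skll_version[0] != VERSION[0]:
            raise ValueError('Model was saved with SKLL {}.'.format(
                '.'.join(map(str, skll_version))))
        return learner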
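
A related note on the setup.py change: moving from scripts= to console_scripts entry points is what lets tests/test_skll.py invoke skll_convert by name, since setuptools generates a small launcher executable for each entry point at install time. Roughly (an illustrative sketch, not the literal generated file), each launcher amounts to:

    # roughly what the generated skll_convert launcher does (illustrative)
    import sys

    from skll.utilities.skll_convert import main

    if __name__ == '__main__':
        sys.exit(main())

This is also why every utility script above gains an explicit main() function, while the trailing `if __name__ == '__main__': main()` guard keeps direct invocation of the module working.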