Commit cd67355

Merge branch 'release/0.18.1'

dan-blanchard committed Oct 24, 2013
2 parents 833b7da + 85bc9de

Showing 9 changed files with 48 additions and 45 deletions.
13 changes: 8 additions & 5 deletions .travis.yml
@@ -11,8 +11,6 @@ notifications:
   email: false
 
 # Install stuff
-virtualenv:
-  system_site_packages: true
 before_install:
   - if [ $GRIDMAP == "true" ]; then travis/install_sge.sh; fi
   - export SGE_ROOT=/var/lib/gridengine
@@ -22,10 +20,15 @@ before_install:
   - sudo mkdir /scratch/
   - sudo chmod 777 /scratch/
   - sudo apt-get update -qq
-  - sudo apt-get install libatlas-dev libatlas-base-dev liblapack-dev gfortran
+  - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ]; then sudo apt-get install libatlas-dev libatlas-base-dev liblapack-dev gfortran; fi
+  - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then wget http://repo.continuum.io/miniconda/Miniconda-2.0.0-Linux-x86_64.sh -O miniconda.sh; else wget http://repo.continuum.io/miniconda/Miniconda3-2.0.0-Linux-x86_64.sh -O miniconda.sh; fi
+  - chmod +x miniconda.sh
+  - ./miniconda.sh -b
+  - export PATH=/home/travis/anaconda/bin:$PATH
 install:
+  - conda install --yes pip python=$TRAVIS_PYTHON_VERSION numpy scipy
+  - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes scikit-learn; fi
+  - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then pip install --use-mirrors configparser; fi
-  - pip install -q numpy --use-mirrors
   - pip install -r requirements.txt --use-mirrors
   - pip install python-coveralls --use-mirrors
   - pip install nose-cov --use-mirrors
@@ -36,7 +39,7 @@ install:
 
 # Run test
 script:
-  - nosetests --with-cov --cov skll --cov-config .coveragerc --logging-level=INFO
+  - nosetests --with-cov --cov skll --cov-config .coveragerc --logging-level=WARNING
 
 # Calculate coverage
 after_success:
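
The conditionals above carry the whole miniconda switch: pick the installer that matches the build's Python major version, bootstrap it, put its bin directory on the PATH, and let conda supply numpy/scipy (with conda scikit-learn and the configparser backport needed only on Python 2). A minimal Python sketch of the same version dispatch, with a hypothetical function name and the URLs copied from the diff:

    import sys

    def miniconda_installer_url(major=sys.version_info[0]):
        """Pick the Miniconda bootstrap script for this Python major version."""
        base = 'http://repo.continuum.io/miniconda/'
        if major == 2:
            return base + 'Miniconda-2.0.0-Linux-x86_64.sh'
        # Python 3 builds get the Miniconda3 installer
        return base + 'Miniconda3-2.0.0-Linux-x86_64.sh'

    print(miniconda_installer_url())
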
7 changes: 7 additions & 0 deletions README.rst
@@ -63,6 +63,13 @@ Requirements
 Changelog
 ~~~~~~~~~
 
+- v0.18.1
+
+  + Updated ``generate_predictions`` to use latest API.
+  + Switched to using multiprocessing-compatible logging. This should fix some
+    intermittent deadlocks.
+  + Switched to using miniconda for installing Python on Travis-CI.
+
 - v0.18.0
 
   + Fixed crash when ``modelpath`` is blank and ``task`` is not
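
The "multiprocessing-compatible logging" entry is the heart of this release; the later hunks in this commit replace every logging.getLogger(__name__) with multiprocessing.log_to_stderr(). A minimal sketch of the swap, on the assumption (suggested by the changelog) that the intermittent deadlocks involved ordinary logging handlers being used from forked worker processes:

    import logging
    from multiprocessing import log_to_stderr

    # Before: logger = logging.getLogger(__name__)
    # After: multiprocessing's own logger, safe to call from workers
    logger = log_to_stderr()
    logger.setLevel(logging.INFO)
    logger.info('visible from parent and worker processes alike')
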
39 changes: 19 additions & 20 deletions scripts/generate_predictions
@@ -25,7 +25,7 @@ Loads a trained model and outputs predictions based on input feature files.
 :date: February 2013
 '''
 
-from __future__ import print_function, unicode_literals
+from __future__ import absolute_import, print_function, unicode_literals
 
 import argparse
 
@@ -40,13 +40,12 @@ class Predictor(object):
     predictions for feature strings.
     """
 
-    def __init__(self, model_prefix, threshold=None, positive_class=1):
+    def __init__(self, model_path, threshold=None, positive_class=1):
         '''
         Initialize the predictor.
 
-        :param model_prefix: Prefix to use when loading trained model (and its
-                             vocab).
-        :type model_prefix: basestring
+        :param model_path: Path to use when loading trained model.
+        :type model_path: str
         :param threshold: If the model we're using is generating probabilities
                           of the positive class, return 1 if it meets/exceeds
                           the given threshold and 0 otherwise.
@@ -58,18 +57,14 @@ class Predictor(object):
                                for binary classification.
         :type positive_class: int
         '''
-        self._learner = Learner()
-        self._learner.load('{}.model'.format(model_prefix))
+        self._learner = Learner.from_file(model_path)
         self._pos_index = positive_class
         self.threshold = threshold
 
     def predict(self, data):
         '''
-        Return a list of predictions for a given numpy array of examples
-        (which are dicts)
+        Return a list of predictions for a given ExamplesTuple of examples.
         '''
-        # Must make a list around a dictionary to fit format that
-        # Learner.predict expects
         preds = self._learner.predict(data).tolist()
 
         if self._learner.probability:
@@ -92,18 +87,14 @@ def main():
                                              on input feature files.",
                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                  conflict_handler='resolve')
-    parser.add_argument('model_prefix', help='Prefix to use when loading \
-                                              trained model (and its vocab).')
+    parser.add_argument('model_file',
+                        help='Model file to load and use for generating \
+                              predictions.')
     parser.add_argument('input_file',
                         help='A csv file, json file, or megam file \
                               (with or without the label column), \
                               with the appropriate suffix.',
                         nargs='+')
-    parser.add_argument('-l', '--has_labels',
-                        help="Indicates that the input file includes \
-                              labels and that the features start at the \
-                              2nd column for csv and megam files.",
-                        action='store_true')
     parser.add_argument('-p', '--positive_class',
                         help="If the model is only being used to predict the \
                               probability of a particular class, this \
@@ -112,23 +103,31 @@ def main():
                              for binary classification. Keep in mind that \
                              classes are sorted lexicographically.",
                         default=1, type=int)
+    parser.add_argument('-q', '--quiet',
+                        help='Suppress printing of "Loading..." messages.',
+                        action='store_true')
     parser.add_argument('-t', '--threshold',
                         help="If the model we're using is generating \
                               probabilities of the positive class, return 1 \
                               if it meets/exceeds the given threshold and 0 \
                               otherwise.",
                         type=float)
+    parser.add_argument('--tsv_label',
+                        help='Name of the column which contains \
+                              the class labels in TSV files.',
+                        default='y')
     parser.add_argument('--version', action='version',
                         version='%(prog)s {0}'.format(__version__))
     args = parser.parse_args()
 
     # Create the classifier and load the model
-    predictor = Predictor(args.model_prefix,
+    predictor = Predictor(args.model_file,
                           positive_class=args.positive_class,
                           threshold=args.threshold)
 
     for input_file in args.input_file:
-        data = load_examples(input_file, has_labels=args.has_labels)
+        data = load_examples(input_file, quiet=args.quiet,
+                             tsv_label=args.tsv_label)
         for pred in predictor.predict(data):
             print(pred)
 
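
Taken together, these hunks replace the old prefix-based loading (a bare Learner() followed by load('{}.model'.format(prefix))) with a single path passed to Learner.from_file, and swap the has_labels flag for quiet/tsv_label. A hedged usage sketch built only from calls visible in this diff; the file names are illustrative:

    from skll.data import load_examples
    from skll.learner import Learner

    # Load a saved model directly from its path (new in this commit)
    learner = Learner.from_file('english.model')

    # load_examples now takes quiet/tsv_label instead of has_labels
    examples = load_examples('test.jsonlines', quiet=True, tsv_label='y')
    for pred in learner.predict(examples).tolist():
        print(pred)
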
4 changes: 0 additions & 4 deletions scripts/run_experiment
@@ -81,10 +81,6 @@ def main():
                         version='%(prog)s {0}'.format(__version__))
     args = parser.parse_args()
 
-    # initialize the logger
-    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
-                        level=logging.INFO)
-
     machines = None
     if args.machines:
         machines = args.machines.split(',')
6 changes: 3 additions & 3 deletions skll/data.py
@@ -27,14 +27,14 @@
 from __future__ import print_function, unicode_literals
 
 import json
-import logging
 import os
 import sys
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from csv import DictReader, DictWriter, excel_tab
 from decimal import Decimal
 from itertools import islice
 from io import open
+from multiprocessing import log_to_stderr
 from operator import itemgetter
 
 import numpy as np
@@ -413,7 +413,7 @@ def load_examples(path, quiet=False, sparse=True, tsv_label='y',
                   the feature matrix.
     '''
     # Setup logger
-    logger = logging.getLogger(__name__)
+    logger = log_to_stderr()
 
     logger.debug('Path: {}'.format(path))
 
@@ -544,7 +544,7 @@ def write_feature_file(path, ids, classes, features, feat_vectorizer=None,
     :type tsv_label: str
     '''
    # Setup logger
-    logger = logging.getLogger(__name__)
+    logger = log_to_stderr()
 
     logger.debug('Feature vectorizer: {}'.format(feat_vectorizer))
     logger.debug('Features: {}'.format(features))
13 changes: 6 additions & 7 deletions skll/experiments.py
@@ -29,15 +29,14 @@
 import datetime
 import errno
 import json
-import logging
 import math
 import os
 import sys
 import tempfile
 from collections import defaultdict
 from io import open
 from itertools import chain
-from multiprocessing import Pool
+from multiprocessing import log_to_stderr, Pool
 
 import configparser  # Backported version from Python 3
 import numpy as np
@@ -114,7 +113,7 @@ def _write_summary_file(result_json_paths, output_file, ablation=False):
     '''
     learner_result_dicts = []
     all_features = set()
-    logger = logging.getLogger(__name__)
+    logger = log_to_stderr()
     for json_path in result_json_paths:
         if not os.path.exists(json_path):
             logger.error(('JSON results file {} not found. Skipping summary ' +
@@ -714,7 +713,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
     # Read configuration
     config = _parse_config_file(config_file)
 
-    logger = logging.getLogger(__name__)
+    logger = log_to_stderr()
     if not local and not _HAVE_GRIDMAP:
         local = True
         logger.warning('gridmap 0.10.1+ not available. Forcing local ' +
@@ -969,7 +968,7 @@ def _check_job_results(job_results):
     '''
     See if we have a complete results dictionary for every job.
     '''
-    logger = logging.getLogger(__name__)
+    logger = log_to_stderr()
     logger.info('checking job results')
     for result_dicts in job_results:
         if not result_dicts or 'task' not in result_dicts[0]:
@@ -1060,7 +1059,7 @@ def run_ablation(config_path, local=False, overwrite=True, queue='all.q',
     # Read configuration
     config = _parse_config_file(config_path)
 
-    logger = logging.getLogger(__name__)
+    logger = log_to_stderr()
 
     featuresets = json.loads(_fix_json(config.get("Input", "featuresets")))
     featureset_names = json.loads(_fix_json(config.get("Input",
@@ -1094,7 +1093,7 @@
     try:
         result_json_paths.extend(chain(*pool.map(_run_experiment_without_feature,
                                                  list(arg_tuples))))
-    # If we run_ablation is run via a subprocess (like nose does),
+    # If run_experiment is run via a subprocess (like nose does),
     # this will fail, so just do things serially then.
     except AssertionError:
         del pool
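
The comment fixed in the last hunk documents a real constraint: multiprocessing.Pool cannot spawn workers from inside a daemonic process (Process.start() raises AssertionError: "daemonic processes are not allowed to have children", which is exactly what happens when nose runs the suite in a subprocess), so the code falls back to serial execution. A stripped-down sketch of that fallback pattern, with hypothetical names:

    from multiprocessing import Pool

    def map_parallel_or_serial(func, args_list, processes=2):
        """Try a worker pool; fall back to a plain serial loop when pools
        are unavailable (e.g. inside a daemonic nose subprocess)."""
        try:
            pool = Pool(processes=processes)
            return pool.map(func, args_list)
        except AssertionError:
            return [func(args) for args in args_list]
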
5 changes: 2 additions & 3 deletions skll/learner.py
@@ -27,12 +27,11 @@
 from __future__ import absolute_import, print_function, unicode_literals
 
 import inspect
-import logging
 import os
 import sys
 from collections import defaultdict
 from functools import wraps
-from multiprocessing import cpu_count
+from multiprocessing import cpu_count, log_to_stderr
 
 import numpy as np
 import scipy.sparse as sp
@@ -107,7 +106,7 @@ def __init__(self, labels, keep, examples):
         self._warned = False
 
     def __iter__(self):
-        logger = logging.getLogger(__name__)
+        logger = log_to_stderr()
         for train_index, test_index in super(FilteredLeaveOneLabelOut,
                                              self).__iter__():
             train_len = len(train_index)
4 changes: 2 additions & 2 deletions skll/metrics.py
@@ -27,7 +27,7 @@
 
 from __future__ import print_function, unicode_literals
 
-import logging
+from multiprocessing import log_to_stderr
 
 import numpy as np
 from scipy.stats import kendalltau, spearmanr, pearsonr
@@ -75,7 +75,7 @@ def kappa(y_true, y_pred, weights=None, allow_off_by_one=False):
                             for when building the weights matrix.
     :type allow_off_by_one: bool
     '''
-    logger = logging.getLogger(__name__)
+    logger = log_to_stderr()
 
     # Ensure that the lists are both the same length
     assert(len(y_true) == len(y_pred))
2 changes: 1 addition & 1 deletion skll/version.py
@@ -23,5 +23,5 @@
 :organization: ETS
 '''
 
-__version__ = '0.18.0'
+__version__ = '0.18.1'
 VERSION = tuple(int(x) for x in __version__.split('.'))
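
Since the VERSION line is visible in the hunk above, a quick check of what it yields after the bump:

    __version__ = '0.18.1'
    VERSION = tuple(int(x) for x in __version__.split('.'))
    assert VERSION == (0, 18, 1)
    assert VERSION > (0, 18, 0)  # tuples compare element-wise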
