From 9eba67d49d57fcff49676d59db2628d95b4141a5 Mon Sep 17 00:00:00 2001 From: Afshin Rostamizadeh Date: Fri, 6 Oct 2017 18:57:13 -0400 Subject: [PATCH] First commit. --- CONTRIBUTING.md | 23 ++ LICENSE | 202 ++++++++++ README.md | 171 +++++++++ __init__.py | 14 + requirements.txt | 9 + run_experiment.py | 344 +++++++++++++++++ sampling_methods/__init__.py | 14 + sampling_methods/bandit_discrete.py | 125 ++++++ sampling_methods/constants.py | 127 ++++++ sampling_methods/graph_density.py | 92 +++++ .../hierarchical_clustering_AL.py | 362 ++++++++++++++++++ sampling_methods/informative_diverse.py | 101 +++++ sampling_methods/kcenter_greedy.py | 123 ++++++ sampling_methods/margin_AL.py | 64 ++++ sampling_methods/mixture_of_samplers.py | 110 ++++++ sampling_methods/represent_cluster_centers.py | 78 ++++ sampling_methods/sampling_def.py | 54 +++ sampling_methods/simulate_batch.py | 261 +++++++++++++ sampling_methods/uniform_sampling.py | 52 +++ sampling_methods/utils/__init__.py | 14 + sampling_methods/utils/tree.py | 158 ++++++++ sampling_methods/utils/tree_test.py | 79 ++++ sampling_methods/wrapper_sampler_def.py | 50 +++ utils/__init__.py | 14 + utils/allconv.py | 196 ++++++++++ utils/chart_data.py | 230 +++++++++++ utils/create_data.py | 284 ++++++++++++++ utils/kernel_block_solver.py | 185 +++++++++ utils/small_cnn.py | 199 ++++++++++ utils/utils.py | 336 ++++++++++++++++ 30 files changed, 4071 insertions(+) create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 __init__.py create mode 100644 requirements.txt create mode 100644 run_experiment.py create mode 100644 sampling_methods/__init__.py create mode 100644 sampling_methods/bandit_discrete.py create mode 100644 sampling_methods/constants.py create mode 100644 sampling_methods/graph_density.py create mode 100644 sampling_methods/hierarchical_clustering_AL.py create mode 100644 sampling_methods/informative_diverse.py create mode 100644 sampling_methods/kcenter_greedy.py create mode 100644 sampling_methods/margin_AL.py create mode 100644 sampling_methods/mixture_of_samplers.py create mode 100644 sampling_methods/represent_cluster_centers.py create mode 100644 sampling_methods/sampling_def.py create mode 100644 sampling_methods/simulate_batch.py create mode 100644 sampling_methods/uniform_sampling.py create mode 100644 sampling_methods/utils/__init__.py create mode 100644 sampling_methods/utils/tree.py create mode 100644 sampling_methods/utils/tree_test.py create mode 100644 sampling_methods/wrapper_sampler_def.py create mode 100644 utils/__init__.py create mode 100644 utils/allconv.py create mode 100644 utils/chart_data.py create mode 100644 utils/create_data.py create mode 100644 utils/kernel_block_solver.py create mode 100644 utils/small_cnn.py create mode 100644 utils/utils.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..ae319c7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,23 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution, +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. 
+ +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7a4a3ea --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..41e2565 --- /dev/null +++ b/README.md @@ -0,0 +1,171 @@ +# Active Learning Playground + +## Introduction + +This is a python module for experimenting with different active learning +algorithms. There are a few key components to running active learning +experiments: + +* Main experiment script is + [`run_experiment.py`](run_experiment.py) + with many flags for different run options. + +* Supported datasets can be downloaded to a specified directory by running + [`utils/create_data.py`](utils/create_data.py). + +* Supported active learning methods are in + [`sampling_methods`](sampling_methods/). + +Below I will go into each component in more detail. + +DISCLAIMER: This is not an official Google product. + +## Setup +The dependencies are in [`requirements.txt`](requirements.txt). Please make sure these packages are +installed before running experiments. If GPU capable `tensorflow` is desired, please follow +instructions [here](https://www.tensorflow.org/install/). + +It is highly suggested that you install all dependencies into a separate `virtualenv` for +easy package management. + +## Getting benchmark datasets + +By default the datasets are saved to `/tmp/data`. You can specify another directory via the +`--save_dir` flag. + +Redownloading all the datasets will be very time consuming so please be patient. +You can specify a subset of the data to download by passing in a comma separated +string of datasets via the `--datasets` flag. + +## Running experiments + +There are a few key flags for +[`run_experiment.py`](run_experiment.py): + +* `dataset`: name of the dataset, must match the save name used in + `create_data.py`. Must also exist in the data_dir. + +* `sampling_method`: active learning method to use. Must be specified in + [`sampling_methods/constants.py`](sampling_methods/constants.py). + +* `warmstart_size`: initial batch of uniformly sampled examples to use as seed + data. Float indicates percentage of total training data and integer + indicates raw size. + +* `batch_size`: number of datapoints to request in each batch. Float indicates + percentage of total training data and integer indicates raw size. + +* `score_method`: model to use to evaluate the performance of the sampling + method. 
Must be in the `get_model` method of
+  [`utils/utils.py`](utils/utils.py).
+
+* `data_dir`: directory with saved datasets.
+
+* `save_dir`: directory to save results.
+
+This is just a subset of all the flags. There are also options for
+preprocessing, introducing labeling noise, dataset subsampling, and using a
+different model to select than to score/evaluate.
+
+## Available active learning methods
+
+All named active learning methods are in
+[`sampling_methods/constants.py`](sampling_methods/constants.py).
+
+You can also specify a mixture of active learning methods by following the
+pattern of `[sampling_method]-[mixture_weight]` separated by dashes, e.g.
+`mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34`.
+
+Some supported sampling methods include:
+
+* Uniform: samples are selected via uniform sampling.
+
+* Margin: uncertainty-based sampling method.
+
+* Informative and diverse: margin- and cluster-based sampling method.
+
+* k-center greedy: representative strategy that greedily forms a batch of
+  points to minimize the maximum distance from a labeled point.
+
+* Graph density: representative strategy that selects points in dense regions
+  of the pool.
+
+* Exp3 bandit: meta-active learning method that tries to learn the optimal
+  sampling method using a popular multi-armed bandit algorithm.
+
+### Adding new active learning methods
+
+Implement either a base sampler that inherits from
+[`SamplingMethod`](sampling_methods/sampling_def.py)
+or a meta-sampler that calls base samplers and inherits from
+[`WrapperSamplingMethod`](sampling_methods/wrapper_sampler_def.py).
+
+The only method that must be implemented by any sampler is `select_batch_`,
+which can have arbitrary named arguments. The only restriction is that the name
+for the same input must be consistent across all the samplers (i.e. the indices
+for already selected examples all have the same name across samplers). Adding a
+new named argument that hasn't been used in other sampling methods will require
+feeding that into the `select_batch` call in
+[`run_experiment.py`](run_experiment.py).
+
+After implementing your sampler, be sure to add it to
+[`constants.py`](sampling_methods/constants.py)
+so that it can be called from
+[`run_experiment.py`](run_experiment.py).
+
+## Available models
+
+All available models are in the `get_model` method of
+[`utils/utils.py`](utils/utils.py).
+
+Supported methods:
+
+* Linear SVM: scikit-learn method with a grid search wrapper for the
+  regularization parameter.
+
+* Kernel SVM: scikit-learn method with a grid search wrapper for the
+  regularization parameter.
+
+* Logistic Regression: scikit-learn method with a grid search wrapper for the
+  regularization parameter.
+
+* Small CNN: 4-layer CNN optimized using rmsprop, implemented in Keras with a
+  tensorflow backend.
+
+* Kernel Least Squares Classification: block gradient descent solver that can
+  use multiple cores, so it is often faster than the scikit-learn kernel SVM.
+
+### Adding new models
+
+New models must follow the scikit-learn API and implement the following
+methods:
+
+* `fit(X, y[, sample_weight])`: fit the model to the input features and
+  target.
+
+* `predict(X)`: predict labels for the input features.
+
+* `score(X, y)`: return the target metric given test features and test
+  targets.
+
+* `decision_function(X)` (optional): return class probabilities, distance to
+  decision boundaries, or another metric that the margin sampler can use as a
+  measure of uncertainty.
+
+See [`small_cnn.py`](utils/small_cnn.py) for an example, and the sketch below
+for the general shape of such a wrapper.
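+
+As a rough, hypothetical sketch (not a file in this repository), a
+grid-search-wrapped scikit-learn classifier that satisfies this interface
+could look like the following; the class name, parameter grid, and defaults
+are illustrative only:
+
+```python
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+
+
+class GridSearchLogistic(object):
+  """Hypothetical example: logistic regression with grid-searched C."""
+
+  def __init__(self, random_state=1, C_grid=(0.01, 0.1, 1.0, 10.0)):
+    # Wrap the scikit-learn estimator in a grid search over regularization.
+    self.model = GridSearchCV(
+        LogisticRegression(random_state=random_state),
+        param_grid={'C': list(C_grid)}, cv=3)
+
+  def fit(self, X, y, sample_weight=None):
+    if sample_weight is None:
+      self.model.fit(X, y)
+    else:
+      self.model.fit(X, y, sample_weight=sample_weight)
+    return self
+
+  def predict(self, X):
+    return self.model.predict(X)
+
+  def score(self, X, y):
+    return self.model.score(X, y)
+
+  def decision_function(self, X):
+    # Smaller margins are treated as more uncertain by the margin sampler.
+    return self.model.decision_function(X)
+```
+
+The design simply mirrors the grid-search-around-regularization pattern used
+by the SVM and logistic regression entries above.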
+ +After implementing your new model, be sure to add it to `get_model` method of +[`utils/utils.py`](utils/utils.py). + +Currently models must be added on a one-off basis and not all scikit-learn +classifiers are supported due to the need for user input on whether and how to +tune the hyperparameters of the model. However, it is very easy to add a +scikit-learn model with hyperparameter search wrapped around as a supported +model. + +## Collecting results and charting + +The +[`utils/chart_data.py`](utils/chart_data.py) +script handles processing of data and charting for a specified dataset and +source directory. diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..3eeb306 --- /dev/null +++ b/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1840c77 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +numpy>=1.13 +scipy>=0.19 +pandas>=0.20 +scikit-learn>=0.19 +matplotlib>=2.0.2 +tensorflow>=1.3 +keras>=2.0.8 +google-apputils>=0.4.2 + diff --git a/run_experiment.py b/run_experiment.py new file mode 100644 index 0000000..aad001b --- /dev/null +++ b/run_experiment.py @@ -0,0 +1,344 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Run active learner on classification tasks. + +Supported datasets include mnist, letter, cifar10, newsgroup20, rcv1, +wikipedia attack, and select classification datasets from mldata. +See utils/create_data.py for all available datasets. + +For binary classification, mnist_4_9 indicates mnist filtered down to just 4 and +9. +By default uses logistic regression but can also train using kernel SVM. +2 fold cv is used to tune regularization parameter over a exponential grid. 
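+
+Example invocation (a sketch; the flag values shown are just the defaults
+defined below):
+
+  python run_experiment.py --dataset=letter --sampling_method=margin \
+    --warmstart_size=0.02 --batch_size=0.02 --score_method=logistic \
+    --data_dir=/tmp/data --save_dir=/tmp/toy_experiments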
+ +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pickle +import sys +from time import gmtime +from time import strftime + +import numpy as np +from sklearn.preprocessing import normalize +from sklearn.preprocessing import StandardScaler + +from google.apputils import app +import gflags as flags +from tensorflow import gfile + +from sampling_methods.constants import AL_MAPPING +from sampling_methods.constants import get_AL_sampler +from sampling_methods.constants import get_wrapper_AL_mapping +from utils import utils + +flags.DEFINE_string("dataset", "letter", "Dataset name") +flags.DEFINE_string("sampling_method", "margin", + ("Name of sampling method to use, can be any defined in " + "AL_MAPPING in sampling_methods.constants")) +flags.DEFINE_float( + "warmstart_size", 0.02, + ("Can be float or integer. Float indicates percentage of training data " + "to use in the initial warmstart model") +) +flags.DEFINE_float( + "batch_size", 0.02, + ("Can be float or integer. Float indicates batch size as a percentage " + "of training data size.") +) +flags.DEFINE_integer("trials", 1, + "Number of curves to create using different seeds") +flags.DEFINE_integer("seed", 1, "Seed to use for rng and random state") +# TODO(lisha): add feature noise to simulate data outliers +flags.DEFINE_string("confusions", "0.", "Percentage of labels to randomize") +flags.DEFINE_string("active_sampling_percentage", "1.0", + "Mixture weights on active sampling.") +flags.DEFINE_string( + "score_method", "logistic", + "Method to use to calculate accuracy.") +flags.DEFINE_string( + "select_method", "None", + "Method to use for selecting points.") +flags.DEFINE_string("normalize_data", "False", "Whether to normalize the data.") +flags.DEFINE_string("standardize_data", "True", + "Whether to standardize the data.") +flags.DEFINE_string("save_dir", "/tmp/toy_experiments", + "Where to save outputs") +flags.DEFINE_string("data_dir", "/tmp/data", + "Directory with predownloaded and saved datasets.") +flags.DEFINE_string("max_dataset_size", "15000", + ("maximum number of datapoints to include in data " + "zero indicates no limit")) +flags.DEFINE_float("train_horizon", "1.0", + "how far to extend learning curve as a percent of train") +flags.DEFINE_string("do_save", "True", + "whether to save log and results") +FLAGS = flags.FLAGS + + +get_wrapper_AL_mapping() + + +def generate_one_curve(X, + y, + sampler, + score_model, + seed, + warmstart_size, + batch_size, + select_model=None, + confusion=0., + active_p=1.0, + max_points=None, + standardize_data=False, + norm_data=False, + train_horizon=0.5): + """Creates one learning curve for both active and passive learning. + + Will calculate accuracy on validation set as the number of training data + points increases for both PL and AL. + Caveats: training method used is sensitive to sorting of the data so we + resort all intermediate datasets + + Args: + X: training data + y: training labels + sampler: sampling class from sampling_methods, assumes reference + passed in and sampler not yet instantiated. + score_model: model used to score the samplers. Expects fit and predict + methods to be implemented. + seed: seed used for data shuffle and other sources of randomness in sampler + or model training + warmstart_size: float or int. float indicates percentage of train data + to use for initial model + batch_size: float or int. 
float indicates batch size as a percent of + training data + select_model: defaults to None, in which case the score model will be + used to select new datapoints to label. Model must implement fit, predict + and depending on AL method may also need decision_function. + confusion: percentage of labels of one class to flip to the other + active_p: percent of batch to allocate to active learning + max_points: limit dataset size for preliminary + standardize_data: wheter to standardize the data to 0 mean unit variance + norm_data: whether to normalize the data. Default is False for logistic + regression. + train_horizon: how long to draw the curve for. Percent of training data. + + Returns: + results: dictionary of results for all samplers + sampler_states: dictionary of sampler objects for debugging + """ + # TODO(lishal): add option to find best hyperparameter setting first on + # full dataset and fix the hyperparameter for the rest of the routine + # This will save computation and also lead to more stable behavior for the + # test accuracy + + # TODO(lishal): remove mixture parameter and have the mixture be specified as + # a mixture of samplers strategy + def select_batch(sampler, uniform_sampler, mixture, N, already_selected, + **kwargs): + n_active = int(mixture * N) + n_passive = N - n_active + kwargs["N"] = n_active + kwargs["already_selected"] = already_selected + batch_AL = sampler.select_batch(**kwargs) + already_selected = already_selected + batch_AL + kwargs["N"] = n_passive + kwargs["already_selected"] = already_selected + batch_PL = uniform_sampler.select_batch(**kwargs) + return batch_AL + batch_PL + + np.random.seed(seed) + data_splits = [2./3, 1./6, 1./6] + + # 2/3 of data for training + if max_points is None: + max_points = len(y) + train_size = int(min(max_points, len(y)) * data_splits[0]) + if batch_size < 1: + batch_size = int(batch_size * train_size) + else: + batch_size = int(batch_size) + if warmstart_size < 1: + # Set seed batch to provide enough samples to get at least 4 per class + # TODO(lishal): switch to sklearn stratified sampler + seed_batch = int(warmstart_size * train_size) + else: + seed_batch = int(warmstart_size) + seed_batch = max(seed_batch, 6 * len(np.unique(y))) + + indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = ( + utils.get_train_val_test_splits(X,y,max_points,seed,confusion, + seed_batch, split=data_splits)) + + # Preprocess data + if norm_data: + print("Normalizing data") + X_train = normalize(X_train) + X_val = normalize(X_val) + X_test = normalize(X_test) + if standardize_data: + print("Standardizing data") + scaler = StandardScaler().fit(X_train) + X_train = scaler.transform(X_train) + X_val = scaler.transform(X_val) + X_test = scaler.transform(X_test) + print("active percentage: " + str(active_p) + " warmstart batch: " + + str(seed_batch) + " batch size: " + str(batch_size) + " confusion: " + + str(confusion) + " seed: " + str(seed)) + + # Initialize samplers + uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed) + sampler = sampler(X_train, y_train, seed) + + results = {} + data_sizes = [] + accuracy = [] + selected_inds = range(seed_batch) + + # If select model is None, use score_model + same_score_select = False + if select_model is None: + select_model = score_model + same_score_select = True + + n_batches = int(np.ceil((train_horizon * train_size - seed_batch) * + 1.0 / batch_size)) + 1 + for b in range(n_batches): + n_train = seed_batch + min(train_size - seed_batch, b * batch_size) + print("Training model 
on " + str(n_train) + " datapoints") + + assert n_train == len(selected_inds) + data_sizes.append(n_train) + + # Sort active_ind so that the end results matches that of uniform sampling + partial_X = X_train[sorted(selected_inds)] + partial_y = y_train[sorted(selected_inds)] + score_model.fit(partial_X, partial_y) + if not same_score_select: + select_model.fit(partial_X, partial_y) + acc = score_model.score(X_test, y_test) + accuracy.append(acc) + print("Sampler: %s, Accuracy: %.2f%%" % (sampler.name, accuracy[-1]*100)) + + n_sample = min(batch_size, train_size - len(selected_inds)) + select_batch_inputs = { + "model": select_model, + "labeled": dict(zip(selected_inds, y_train[selected_inds])), + "eval_acc": accuracy[-1], + "X_test": X_val, + "y_test": y_val, + "y": y_train + } + new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample, + selected_inds, **select_batch_inputs) + selected_inds.extend(new_batch) + print('Requested: %d, Selected: %d' % (n_sample, len(new_batch))) + assert len(new_batch) == n_sample + assert len(list(set(selected_inds))) == len(selected_inds) + + # Check that the returned indice are correct and will allow mapping to + # training set from original data + assert all(y_noise[indices[selected_inds]] == y_train[selected_inds]) + results["accuracy"] = accuracy + results["selected_inds"] = selected_inds + results["data_sizes"] = data_sizes + results["indices"] = indices + results["noisy_targets"] = y_noise + return results, sampler + + +def main(argv): + del argv + + if not gfile.Exists(FLAGS.save_dir): + try: + gfile.MkDir(FLAGS.save_dir) + except: + print(('WARNING: error creating save directory, ' + 'directory most likely already created.')) + + save_dir = os.path.join( + FLAGS.save_dir, + FLAGS.dataset + "_" + FLAGS.sampling_method) + do_save = FLAGS.do_save == "True" + + if do_save: + if not gfile.Exists(save_dir): + try: + gfile.MkDir(save_dir) + except: + print(('WARNING: error creating save directory, ' + 'directory most likely already created.')) + # Set up logging + filename = os.path.join( + save_dir, "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt") + sys.stdout = utils.Logger(filename) + + confusions = [float(t) for t in FLAGS.confusions.split(" ")] + mixtures = [float(t) for t in FLAGS.active_sampling_percentage.split(" ")] + all_results = {} + max_dataset_size = None if FLAGS.max_dataset_size == "0" else int( + FLAGS.max_dataset_size) + normalize_data = FLAGS.normalize_data == "True" + standardize_data = FLAGS.standardize_data == "True" + X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset) + starting_seed = FLAGS.seed + + for c in confusions: + for m in mixtures: + for seed in range(starting_seed, starting_seed + FLAGS.trials): + sampler = get_AL_sampler(FLAGS.sampling_method) + score_model = utils.get_model(FLAGS.score_method, seed) + if (FLAGS.select_method == "None" or + FLAGS.select_method == FLAGS.score_method): + select_model = None + else: + select_model = utils.get_model(FLAGS.select_method, seed) + results, sampler_state = generate_one_curve( + X, y, sampler, score_model, seed, FLAGS.warmstart_size, + FLAGS.batch_size, select_model, c, m, max_dataset_size, + standardize_data, normalize_data, FLAGS.train_horizon) + key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method, + FLAGS.select_method, m, FLAGS.warmstart_size, FLAGS.batch_size, + c, standardize_data, normalize_data, seed) + sampler_output = sampler_state.to_dict() + results["sampler_output"] = sampler_output + all_results[key] = results + fields = [ + 
"dataset", "sampler", "score_method", "select_method", + "active percentage", "warmstart size", "batch size", "confusion", + "standardize", "normalize", "seed" + ] + all_results["tuple_keys"] = fields + + if do_save: + filename = ("results_score_" + FLAGS.score_method + + "_select_" + FLAGS.select_method + + "_norm_" + str(normalize_data) + + "_stand_" + str(standardize_data)) + existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl")) + filename = os.path.join(save_dir, + filename + "_" + str(1000+len(existing_files))[1:] + ".pkl") + pickle.dump(all_results, gfile.GFile(filename, "w")) + sys.stdout.flush_file() + + +if __name__ == "__main__": + app.run() diff --git a/sampling_methods/__init__.py b/sampling_methods/__init__.py new file mode 100644 index 0000000..3eeb306 --- /dev/null +++ b/sampling_methods/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/sampling_methods/bandit_discrete.py b/sampling_methods/bandit_discrete.py new file mode 100644 index 0000000..44f1bf4 --- /dev/null +++ b/sampling_methods/bandit_discrete.py @@ -0,0 +1,125 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Bandit wrapper around base AL sampling methods. + +Assumes adversarial multi-armed bandit setting where arms correspond to +mixtures of different AL methods. + +Uses EXP3 algorithm to decide which AL method to use to create the next batch. +Similar to Hsu & Lin 2015, Active Learning by Learning. +https://www.csie.ntu.edu.tw/~htlin/paper/doc/aaai15albl.pdf +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from sampling_methods.wrapper_sampler_def import AL_MAPPING, WrapperSamplingMethod + + +class BanditDiscreteSampler(WrapperSamplingMethod): + """Wraps EXP3 around mixtures of indicated methods. + + Uses EXP3 mult-armed bandit algorithm to select sampler methods. + """ + def __init__(self, + X, + y, + seed, + reward_function = lambda AL_acc: AL_acc[-1], + gamma=0.5, + samplers=[{'methods':('margin','uniform'),'weights':(0,1)}, + {'methods':('margin','uniform'),'weights':(1,0)}]): + """Initializes sampler with indicated gamma and arms. + + Args: + X: training data + y: labels, may need to be input into base samplers + seed: seed to use for random sampling + reward_function: reward based on previously observed accuracies. Assumes + that the input is a sequence of observed accuracies. 
Will ultimately be + a class method and may need access to other class properties. + gamma: weight on uniform mixture. Arm probability updates are a weighted + mixture of uniform and an exponentially weighted distribution. + Lower gamma more aggressively updates based on observed rewards. + samplers: list of dicts with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + """ + + self.name = 'bandit_discrete' + np.random.seed(seed) + self.X = X + self.y = y + self.seed = seed + self.initialize_samplers(samplers) + + self.gamma = gamma + self.n_arms = len(samplers) + self.reward_function = reward_function + + self.pull_history = [] + self.acc_history = [] + self.w = np.ones(self.n_arms) + self.x = np.zeros(self.n_arms) + self.p = self.w / (1.0 * self.n_arms) + self.probs = [] + + def update_vars(self, arm_pulled): + reward = self.reward_function(self.acc_history) + self.x = np.zeros(self.n_arms) + self.x[arm_pulled] = reward / self.p[arm_pulled] + self.w = self.w * np.exp(self.gamma * self.x / self.n_arms) + self.p = ((1.0 - self.gamma) * self.w / sum(self.w) + + self.gamma / self.n_arms) + print(self.p) + self.probs.append(self.p) + + def select_batch_(self, already_selected, N, eval_acc, **kwargs): + """Returns batch of datapoints sampled using mixture of AL_methods. + + Assumes that data has already been shuffled. + + Args: + already_selected: index of datapoints already selected + N: batch size + eval_acc: accuracy of model trained after incorporating datapoints from + last recommended batch + + Returns: + indices of points selected to label + """ + # Update observed reward and arm probabilities + self.acc_history.append(eval_acc) + if len(self.pull_history) > 0: + self.update_vars(self.pull_history[-1]) + # Sample an arm + arm = np.random.choice(range(self.n_arms), p=self.p) + self.pull_history.append(arm) + kwargs['N'] = N + kwargs['already_selected'] = already_selected + sample = self.samplers[arm].select_batch(**kwargs) + return sample + + def to_dict(self): + output = {} + output['samplers'] = self.base_samplers + output['arm_probs'] = self.probs + output['pull_history'] = self.pull_history + output['rewards'] = self.acc_history + return output + diff --git a/sampling_methods/constants.py b/sampling_methods/constants.py new file mode 100644 index 0000000..232c8f6 --- /dev/null +++ b/sampling_methods/constants.py @@ -0,0 +1,127 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Controls imports to fill up dictionary of different sampling methods. 
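+
+A typical usage sketch (X, y, and seed are placeholders; samplers in this
+package are constructed as sampler(X, y, seed)):
+
+  from sampling_methods.constants import get_base_AL_mapping, get_AL_sampler
+  get_base_AL_mapping()  # fills AL_MAPPING with the base samplers
+  sampler = get_AL_sampler('margin')(X, y, seed)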
+""" + +from functools import partial +AL_MAPPING = {} + + +def get_base_AL_mapping(): + from sampling_methods.margin_AL import MarginAL + from sampling_methods.informative_diverse import InformativeClusterDiverseSampler + from sampling_methods.hierarchical_clustering_AL import HierarchicalClusterAL + from sampling_methods.uniform_sampling import UniformSampling + from sampling_methods.represent_cluster_centers import RepresentativeClusterMeanSampling + from sampling_methods.graph_density import GraphDensitySampler + from sampling_methods.kcenter_greedy import kCenterGreedy + AL_MAPPING['margin'] = MarginAL + AL_MAPPING['informative_diverse'] = InformativeClusterDiverseSampler + AL_MAPPING['hierarchical'] = HierarchicalClusterAL + AL_MAPPING['uniform'] = UniformSampling + AL_MAPPING['margin_cluster_mean'] = RepresentativeClusterMeanSampling + AL_MAPPING['graph_density'] = GraphDensitySampler + AL_MAPPING['kcenter'] = kCenterGreedy + + +def get_all_possible_arms(): + from sampling_methods.mixture_of_samplers import MixtureOfSamplers + AL_MAPPING['mixture_of_samplers'] = MixtureOfSamplers + + +def get_wrapper_AL_mapping(): + from sampling_methods.bandit_discrete import BanditDiscreteSampler + from sampling_methods.simulate_batch import SimulateBatchSampler + AL_MAPPING['bandit_mixture'] = partial( + BanditDiscreteSampler, + samplers=[{ + 'methods': ['margin', 'uniform'], + 'weights': [0, 1] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0.25, 0.75] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0.5, 0.5] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0.75, 0.25] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }]) + AL_MAPPING['bandit_discrete'] = partial( + BanditDiscreteSampler, + samplers=[{ + 'methods': ['margin', 'uniform'], + 'weights': [0, 1] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }]) + AL_MAPPING['simulate_batch_mixture'] = partial( + SimulateBatchSampler, + samplers=({ + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0.5, 0.5] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0, 1] + }), + n_sims=5, + train_per_sim=10, + return_best_sim=False) + AL_MAPPING['simulate_batch_best_sim'] = partial( + SimulateBatchSampler, + samplers=[{ + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }], + n_sims=10, + train_per_sim=10, + return_type='best_sim') + AL_MAPPING['simulate_batch_frequency'] = partial( + SimulateBatchSampler, + samplers=[{ + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }], + n_sims=10, + train_per_sim=10, + return_type='frequency') + +def get_mixture_of_samplers(name): + assert 'mixture_of_samplers' in name + if 'mixture_of_samplers' not in AL_MAPPING: + raise KeyError('Mixture of Samplers not yet loaded.') + args = name.split('-')[1:] + samplers = args[0::2] + weights = args[1::2] + weights = [float(w) for w in weights] + assert sum(weights) == 1 + mixture = {'methods': samplers, 'weights': weights} + print(mixture) + return partial(AL_MAPPING['mixture_of_samplers'], mixture=mixture) + + +def get_AL_sampler(name): + if name in AL_MAPPING and name != 'mixture_of_samplers': + return AL_MAPPING[name] + if 'mixture_of_samplers' in name: + return get_mixture_of_samplers(name) + raise NotImplementedError('The specified sampler is not available.') diff --git a/sampling_methods/graph_density.py b/sampling_methods/graph_density.py new file mode 100644 index 0000000..d6f13d5 --- /dev/null +++ 
b/sampling_methods/graph_density.py @@ -0,0 +1,92 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Diversity promoting sampling method that uses graph density to determine + most representative points. + +This is an implementation of the method described in +https://www.mpi-inf.mpg.de/fileadmin/inf/d2/Research_projects_files/EbertCVPR2012.pdf +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +from sklearn.neighbors import kneighbors_graph +from sklearn.metrics import pairwise_distances +import numpy as np +from sampling_methods.sampling_def import SamplingMethod + + +class GraphDensitySampler(SamplingMethod): + """Diversity promoting sampling method that uses graph density to determine + most representative points. + """ + + def __init__(self, X, y, seed): + self.name = 'graph_density' + self.X = X + self.flat_X = self.flatten_X() + # Set gamma for gaussian kernel to be equal to 1/n_features + self.gamma = 1. / self.X.shape[1] + self.compute_graph_density() + + def compute_graph_density(self, n_neighbor=10): + # kneighbors graph is constructed using k=10 + connect = kneighbors_graph(self.flat_X, n_neighbor,p=1) + # Make connectivity matrix symmetric, if a point is a k nearest neighbor of + # another point, make it vice versa + neighbors = connect.nonzero() + inds = zip(neighbors[0],neighbors[1]) + # Graph edges are weighted by applying gaussian kernel to manhattan dist. + # By default, gamma for rbf kernel is equal to 1/n_features but may + # get better results if gamma is tuned. + for entry in inds: + i = entry[0] + j = entry[1] + distance = pairwise_distances(self.flat_X[[i]],self.flat_X[[j]],metric='manhattan') + distance = distance[0,0] + weight = np.exp(-distance * self.gamma) + connect[i,j] = weight + connect[j,i] = weight + self.connect = connect + # Define graph density for an observation to be sum of weights for all + # edges to the node representing the datapoint. Normalize sum weights + # by total number of neighbors. + self.graph_density = np.zeros(self.X.shape[0]) + for i in np.arange(self.X.shape[0]): + self.graph_density[i] = connect[i,:].sum() / (connect[i,:]>0).sum() + self.starting_density = copy.deepcopy(self.graph_density) + + def select_batch_(self, N, already_selected, **kwargs): + # If a neighbor has already been sampled, reduce the graph density + # for its direct neighbors to promote diversity. 
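+    # Greedy selection: repeatedly pick the point with the highest remaining
+    # graph density, then subtract that point's density from its direct
+    # neighbors so that later picks come from other dense regions. Points
+    # already selected (or already added to this batch) are pushed below the
+    # running minimum density so that argmax never returns them.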
+ batch = set() + self.graph_density[already_selected] = min(self.graph_density) - 1 + while len(batch) < N: + selected = np.argmax(self.graph_density) + neighbors = (self.connect[selected,:] > 0).nonzero()[1] + self.graph_density[neighbors] = self.graph_density[neighbors] - self.graph_density[selected] + batch.add(selected) + self.graph_density[already_selected] = min(self.graph_density) - 1 + self.graph_density[list(batch)] = min(self.graph_density) - 1 + return list(batch) + + def to_dict(self): + output = {} + output['connectivity'] = self.connect + output['graph_density'] = self.starting_density + return output \ No newline at end of file diff --git a/sampling_methods/hierarchical_clustering_AL.py b/sampling_methods/hierarchical_clustering_AL.py new file mode 100644 index 0000000..33421d6 --- /dev/null +++ b/sampling_methods/hierarchical_clustering_AL.py @@ -0,0 +1,362 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Hierarchical cluster AL method. + +Implements algorithm described in Dasgupta, S and Hsu, D, +"Hierarchical Sampling for Active Learning, 2008 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from sklearn.cluster import AgglomerativeClustering +from sklearn.decomposition import PCA +from sklearn.neighbors import kneighbors_graph +from sampling_methods.sampling_def import SamplingMethod +from sampling_methods.utils.tree import Tree + + +class HierarchicalClusterAL(SamplingMethod): + """Implements hierarchical cluster AL based method. + + All methods are internal. select_batch_ is called via abstract classes + outward facing method select_batch. + + Default affininity is euclidean and default linkage is ward which links + cluster based on variance reduction. Hence, good results depend on + having normalized and standardized data. + """ + + def __init__(self, X, y, seed, beta=2, affinity='euclidean', linkage='ward', + clustering=None, max_features=None): + """Initializes AL method and fits hierarchical cluster to data. + + Args: + X: data + y: labels for determinining number of clusters as an input to + AgglomerativeClustering + seed: random seed used for sampling datapoints for batch + beta: width of error used to decide admissble labels, higher value of beta + corresponds to wider confidence and less stringent definition of + admissibility + See scikit Aggloerative clustering method for more info + affinity: distance metric used for hierarchical clustering + linkage: linkage method used to determine when to join clusters + clustering: can provide an AgglomerativeClustering that is already fit + max_features: limit number of features used to construct hierarchical + cluster. If specified, PCA is used to perform feature reduction and + the hierarchical clustering is performed using transformed features. 
+ """ + self.name = 'hierarchical' + self.seed = seed + np.random.seed(seed) + # Variables for the hierarchical cluster + self.already_clustered = False + if clustering is not None: + self.model = clustering + self.already_clustered = True + self.n_leaves = None + self.n_components = None + self.children_list = None + self.node_dict = None + self.root = None # Node name, all node instances access through self.tree + self.tree = None + # Variables for the AL algorithm + self.initialized = False + self.beta = beta + self.labels = {} + self.pruning = [] + self.admissible = {} + self.selected_nodes = None + # Data variables + self.classes = None + self.X = X + + classes = list(set(y)) + self.n_classes = len(classes) + if max_features is not None: + transformer = PCA(n_components=max_features) + transformer.fit(X) + self.transformed_X = transformer.fit_transform(X) + #connectivity = kneighbors_graph(self.transformed_X,max_features) + self.model = AgglomerativeClustering( + affinity=affinity, linkage=linkage, n_clusters=len(classes)) + self.fit_cluster(self.transformed_X) + else: + self.model = AgglomerativeClustering( + affinity=affinity, linkage=linkage, n_clusters=len(classes)) + self.fit_cluster(self.X) + self.y = y + + self.y_labels = {} + # Fit cluster and update cluster variables + + self.create_tree() + print('Finished creating hierarchical cluster') + + def fit_cluster(self, X): + if not self.already_clustered: + self.model.fit(X) + self.already_clustered = True + self.n_leaves = self.model.n_leaves_ + self.n_components = self.model.n_components_ + self.children_list = self.model.children_ + + def create_tree(self): + node_dict = {} + for i in range(self.n_leaves): + node_dict[i] = [None, None] + for i in range(len(self.children_list)): + node_dict[self.n_leaves + i] = self.children_list[i] + self.node_dict = node_dict + # The sklearn hierarchical clustering algo numbers leaves which correspond + # to actual datapoints 0 to n_points - 1 and all internal nodes have + # ids greater than n_points - 1 with the root having the highest node id + self.root = max(self.node_dict.keys()) + self.tree = Tree(self.root, self.node_dict) + self.tree.create_child_leaves_mapping(range(self.n_leaves)) + for v in node_dict: + self.admissible[v] = set() + + def get_child_leaves(self, node): + return self.tree.get_child_leaves(node) + + def get_node_leaf_counts(self, node_list): + node_counts = [] + for v in node_list: + node_counts.append(len(self.get_child_leaves(v))) + return np.array(node_counts) + + def get_class_counts(self, y): + """Gets the count of all classes in a sample. 
+ + Args: + y: sample vector for which to perform the count + Returns: + count of classes for the sample vector y, the class order for count will + be the same as that of self.classes + """ + unique, counts = np.unique(y, return_counts=True) + complete_counts = [] + for c in self.classes: + if c not in unique: + complete_counts.append(0) + else: + index = np.where(unique == c)[0][0] + complete_counts.append(counts[index]) + return np.array(complete_counts) + + def observe_labels(self, labeled): + for i in labeled: + self.y_labels[i] = labeled[i] + self.classes = np.array( + sorted(list(set([self.y_labels[k] for k in self.y_labels])))) + self.n_classes = len(self.classes) + + def initialize_algo(self): + self.pruning = [self.root] + self.labels[self.root] = np.random.choice(self.classes) + node = self.tree.get_node(self.root) + node.best_label = self.labels[self.root] + self.selected_nodes = [self.root] + + def get_node_class_probabilities(self, node, y=None): + children = self.get_child_leaves(node) + if y is None: + y_dict = self.y_labels + else: + y_dict = dict(zip(range(len(y)), y)) + labels = [y_dict[c] for c in children if c in y_dict] + # If no labels have been observed, simply return uniform distribution + if len(labels) == 0: + return 0, np.ones(self.n_classes)/self.n_classes + return len(labels), self.get_class_counts(labels) / (len(labels) * 1.0) + + def get_node_upper_lower_bounds(self, node): + n_v, p_v = self.get_node_class_probabilities(node) + # If no observations, return worst possible upper lower bounds + if n_v == 0: + return np.zeros(len(p_v)), np.ones(len(p_v)) + delta = 1. / n_v + np.sqrt(p_v * (1 - p_v) / (1. * n_v)) + return (np.maximum(p_v - delta, np.zeros(len(p_v))), + np.minimum(p_v + delta, np.ones(len(p_v)))) + + def get_node_admissibility(self, node): + p_lb, p_up = self.get_node_upper_lower_bounds(node) + all_other_min = np.vectorize( + lambda i:min([1 - p_up[c] for c in range(len(self.classes)) if c != i])) + lowest_alternative_error = self.beta * all_other_min( + np.arange(len(self.classes))) + return 1 - p_lb < lowest_alternative_error + + def get_adjusted_error(self, node): + _, prob = self.get_node_class_probabilities(node) + error = 1 - prob + admissible = self.get_node_admissibility(node) + not_admissible = np.where(admissible != True)[0] + error[not_admissible] = 1.0 + return error + + def get_class_probability_pruning(self, method='lower'): + prob_pruning = [] + for v in self.pruning: + label = self.labels[v] + label_ind = np.where(self.classes == label)[0][0] + if method == 'empirical': + _, v_prob = self.get_node_class_probabilities(v) + else: + lower, upper = self.get_node_upper_lower_bounds(v) + if method == 'lower': + v_prob = lower + elif method == 'upper': + v_prob = upper + else: + raise NotImplementedError + prob = v_prob[label_ind] + prob_pruning.append(prob) + return np.array(prob_pruning) + + def get_pruning_impurity(self, y): + impurity = [] + for v in self.pruning: + _, prob = self.get_node_class_probabilities(v, y) + impurity.append(1-max(prob)) + impurity = np.array(impurity) + weights = self.get_node_leaf_counts(self.pruning) + weights = weights / sum(weights) + return sum(impurity*weights) + + def update_scores(self): + node_list = set(range(self.n_leaves)) + # Loop through generations from bottom to top + while len(node_list) > 0: + parents = set() + for v in node_list: + node = self.tree.get_node(v) + # Update admissible labels for node + admissible = self.get_node_admissibility(v) + admissable_indices = np.where(admissible)[0] + 
for l in self.classes[admissable_indices]: + self.admissible[v].add(l) + # Calculate score + v_error = self.get_adjusted_error(v) + best_label_ind = np.argmin(v_error) + if admissible[best_label_ind]: + node.best_label = self.classes[best_label_ind] + score = v_error[best_label_ind] + node.split = False + + # Determine if node should be split + if v >= self.n_leaves: # v is not a leaf + if len(admissable_indices) > 0: # There exists an admissible label + # Make sure label set for node so that we can flow to children + # if necessary + assert node.best_label is not None + # Only split if all ancestors are admissible nodes + # This is part of definition of admissible pruning + admissible_ancestors = [len(self.admissible[a]) > 0 for a in + self.tree.get_ancestor(node)] + if all(admissible_ancestors): + left = self.node_dict[v][0] + left_node = self.tree.get_node(left) + right = self.node_dict[v][1] + right_node = self.tree.get_node(right) + node_counts = self.get_node_leaf_counts([v, left, right]) + split_score = (node_counts[1] / node_counts[0] * + left_node.score + node_counts[2] / + node_counts[0] * right_node.score) + if split_score < score: + score = split_score + node.split = True + node.score = score + if node.parent: + parents.add(node.parent.name) + node_list = parents + + def update_pruning_labels(self): + for v in self.selected_nodes: + node = self.tree.get_node(v) + pruning = self.tree.get_pruning(node) + self.pruning.remove(v) + self.pruning.extend(pruning) + # Check that pruning covers all leave nodes + node_counts = self.get_node_leaf_counts(self.pruning) + assert sum(node_counts) == self.n_leaves + # Fill in labels + for v in self.pruning: + node = self.tree.get_node(v) + if node.best_label is None: + node.best_label = node.parent.best_label + self.labels[v] = node.best_label + + def get_fake_labels(self): + fake_y = np.zeros(self.X.shape[0]) + for p in self.pruning: + indices = self.get_child_leaves(p) + fake_y[indices] = self.labels[p] + return fake_y + + def train_using_fake_labels(self, model, X_test, y_test): + classes_labeled = set([self.labels[p] for p in self.pruning]) + if len(classes_labeled) == self.n_classes: + fake_y = self.get_fake_labels() + model.fit(self.X, fake_y) + test_acc = model.score(X_test, y_test) + return test_acc + return 0 + + def select_batch_(self, N, already_selected, labeled, y, **kwargs): + # Observe labels for previously recommended batches + self.observe_labels(labeled) + + if not self.initialized: + self.initialize_algo() + self.initialized = True + print('Initialized algo') + + print('Updating scores and pruning for labels from last batch') + self.update_scores() + self.update_pruning_labels() + print('Nodes in pruning: %d' % (len(self.pruning))) + print('Actual impurity for pruning is: %.2f' % + (self.get_pruning_impurity(y))) + + # TODO(lishal): implement multiple selection methods + selected_nodes = set() + weights = self.get_node_leaf_counts(self.pruning) + probs = 1 - self.get_class_probability_pruning() + weights = weights * probs + weights = weights / sum(weights) + batch = [] + + print('Sampling batch') + while len(batch) < N: + node = np.random.choice(list(self.pruning), p=weights) + children = self.get_child_leaves(node) + children = [ + c for c in children if c not in self.y_labels and c not in batch + ] + if len(children) > 0: + selected_nodes.add(node) + batch.append(np.random.choice(children)) + self.selected_nodes = selected_nodes + return batch + + def to_dict(self): + output = {} + output['node_dict'] = self.node_dict + 
return output diff --git a/sampling_methods/informative_diverse.py b/sampling_methods/informative_diverse.py new file mode 100644 index 0000000..d102337 --- /dev/null +++ b/sampling_methods/informative_diverse.py @@ -0,0 +1,101 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Informative and diverse batch sampler that samples points with small margin +while maintaining same distribution over clusters as entire training data. + +Batch is created by sorting datapoints by increasing margin and then growing +the batch greedily. A point is added to the batch if the result batch still +respects the constraint that the cluster distribution of the batch will +match the cluster distribution of the entire training set. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from sklearn.cluster import MiniBatchKMeans +import numpy as np +from sampling_methods.sampling_def import SamplingMethod + + +class InformativeClusterDiverseSampler(SamplingMethod): + """Selects batch based on informative and diverse criteria. + + Returns highest uncertainty lowest margin points while maintaining + same distribution over clusters as entire dataset. + """ + + def __init__(self, X, y, seed): + self.name = 'informative_and_diverse' + self.X = X + self.flat_X = self.flatten_X() + # y only used for determining how many clusters there should be + # probably not practical to assume we know # of classes before hand + # should also probably scale with dimensionality of data + self.y = y + self.n_clusters = len(list(set(y))) + self.cluster_model = MiniBatchKMeans(n_clusters=self.n_clusters) + self.cluster_data() + + def cluster_data(self): + # Probably okay to always use MiniBatchKMeans + # Should standardize data before clustering + # Can cluster on standardized data but train on raw features if desired + self.cluster_model.fit(self.flat_X) + unique, counts = np.unique(self.cluster_model.labels_, return_counts=True) + self.cluster_prob = counts/sum(counts) + self.cluster_labels = self.cluster_model.labels_ + + def select_batch_(self, model, already_selected, N, **kwargs): + """Returns a batch of size N using informative and diverse selection. 
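# The cluster-matching constraint used by this sampler only needs the empirical
# cluster shares computed in cluster_data() above. A minimal standalone sketch;
# X_pool and the cluster count are synthetic placeholders, not repo objects.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

X_pool = np.random.RandomState(0).rand(500, 10)
km = MiniBatchKMeans(n_clusters=5, random_state=0).fit(X_pool)
_, counts = np.unique(km.labels_, return_counts=True)
cluster_prob = counts / float(counts.sum())  # target per-cluster share of each new batch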
+ + Args: + model: scikit learn model with decision_function implemented + already_selected: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to add using margin active learner + """ + # TODO(lishal): have MarginSampler and this share margin function + try: + distances = model.decision_function(self.X) + except: + distances = model.predict_proba(self.X) + if len(distances.shape) < 2: + min_margin = abs(distances) + else: + sort_distances = np.sort(distances, 1)[:, -2:] + min_margin = sort_distances[:, 1] - sort_distances[:, 0] + rank_ind = np.argsort(min_margin) + rank_ind = [i for i in rank_ind if i not in already_selected] + new_batch_cluster_counts = [0 for _ in range(self.n_clusters)] + new_batch = [] + for i in rank_ind: + if len(new_batch) == N: + break + label = self.cluster_labels[i] + if new_batch_cluster_counts[label] / N < self.cluster_prob[label]: + new_batch.append(i) + new_batch_cluster_counts[label] += 1 + n_slot_remaining = N - len(new_batch) + batch_filler = list(set(rank_ind) - set(already_selected) - set(new_batch)) + new_batch.extend(batch_filler[0:n_slot_remaining]) + return new_batch + + def to_dict(self): + output = {} + output['cluster_membership'] = self.cluster_labels + return output diff --git a/sampling_methods/kcenter_greedy.py b/sampling_methods/kcenter_greedy.py new file mode 100644 index 0000000..ff7e548 --- /dev/null +++ b/sampling_methods/kcenter_greedy.py @@ -0,0 +1,123 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Returns points that minimizes the maximum distance of any point to a center. + +Implements the k-Center-Greedy method in +Ozan Sener and Silvio Savarese. A Geometric Approach to Active Learning for +Convolutional Neural Networks. https://arxiv.org/abs/1708.00489 2017 + +Distance metric defaults to l2 distance. Features used to calculate distance +are either raw features or if a model has transform method then uses the output +of model.transform(X). + +Can be extended to a robust k centers algorithm that ignores a certain number of +outlier datapoints. Resulting centers are solution to multiple integer program. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from sklearn.metrics import pairwise_distances +from sampling_methods.sampling_def import SamplingMethod + + +class kCenterGreedy(SamplingMethod): + + def __init__(self, X, y, seed, metric='euclidean'): + self.X = X + self.y = y + self.flat_X = self.flatten_X() + self.name = 'kcenter' + self.features = self.flat_X + self.metric = metric + self.min_distances = None + self.n_obs = self.X.shape[0] + self.already_selected = [] + + def update_distances(self, cluster_centers, only_new=True, reset_dist=False): + """Update min distances given cluster centers. + + Args: + cluster_centers: indices of cluster centers + only_new: only calculate distance for newly selected points and update + min_distances. 
+ rest_dist: whether to reset min_distances. + """ + + if reset_dist: + self.min_distances = None + if only_new: + cluster_centers = [d for d in cluster_centers + if d not in self.already_selected] + if cluster_centers: + # Update min_distances for all examples given new cluster center. + x = self.features[cluster_centers] + dist = pairwise_distances(self.features, x, metric=self.metric) + + if self.min_distances is None: + self.min_distances = np.min(dist, axis=1).reshape(-1,1) + else: + self.min_distances = np.minimum(self.min_distances, dist) + + def select_batch_(self, model, already_selected, N, **kwargs): + """ + Diversity promoting active learning method that greedily forms a batch + to minimize the maximum distance to a cluster center among all unlabeled + datapoints. + + Args: + model: model with scikit-like API with decision_function implemented + already_selected: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to minimize distance to cluster centers + """ + + try: + # Assumes that the transform function takes in original data and not + # flattened data. + print('Getting transformed features...') + self.features = model.transform(self.X) + print('Calculating distances...') + self.update_distances(already_selected, only_new=False, reset_dist=True) + except: + print('Using flat_X as features.') + self.update_distances(already_selected, only_new=True, reset_dist=False) + + new_batch = [] + + for _ in range(N): + if self.already_selected is None: + # Initialize centers with a randomly selected datapoint + ind = np.random.choice(np.arange(self.n_obs)) + else: + ind = np.argmax(self.min_distances) + # New examples should not be in already selected since those points + # should have min_distance of zero to a cluster center. + assert ind not in already_selected + + self.update_distances([ind], only_new=True, reset_dist=False) + new_batch.append(ind) + print('Maximum distance from cluster centers is %0.2f' + % max(self.min_distances)) + + + self.already_selected = already_selected + + return new_batch + diff --git a/sampling_methods/margin_AL.py b/sampling_methods/margin_AL.py new file mode 100644 index 0000000..6058a84 --- /dev/null +++ b/sampling_methods/margin_AL.py @@ -0,0 +1,64 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Margin based AL method. + +Samples in batches based on margin scores. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from sampling_methods.sampling_def import SamplingMethod + + +class MarginAL(SamplingMethod): + def __init__(self, X, y, seed): + self.X = X + self.y = y + self.name = 'margin' + + def select_batch_(self, model, already_selected, N, **kwargs): + """Returns batch of datapoints with smallest margin/highest uncertainty. + + For binary classification, can just take the absolute distance to decision + boundary for each point. 
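# A stripped-down version of the farthest-first rule implemented in
# kcenter_greedy.py above: repeatedly pick the point with the largest distance
# to its nearest already-chosen center. Illustrative only; feats is a synthetic
# stand-in for the (possibly model-transformed) features.
import numpy as np
from sklearn.metrics import pairwise_distances

feats = np.random.RandomState(0).rand(200, 8)
centers = [0]  # arbitrary seed center
min_dist = pairwise_distances(feats, feats[centers]).min(axis=1)
for _ in range(9):  # choose 9 more centers
    new_center = int(np.argmax(min_dist))
    centers.append(new_center)
    dist_to_new = pairwise_distances(feats, feats[[new_center]]).ravel()
    min_dist = np.minimum(min_dist, dist_to_new)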
+ For multiclass classification, must consider the margin between distance for + top two most likely classes. + + Args: + model: scikit learn model with decision_function implemented + already_selected: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to add using margin active learner + """ + + try: + distances = model.decision_function(self.X) + except: + distances = model.predict_proba(self.X) + if len(distances.shape) < 2: + min_margin = abs(distances) + else: + sort_distances = np.sort(distances, 1)[:, -2:] + min_margin = sort_distances[:, 1] - sort_distances[:, 0] + rank_ind = np.argsort(min_margin) + rank_ind = [i for i in rank_ind if i not in already_selected] + active_samples = rank_ind[0:N] + return active_samples + diff --git a/sampling_methods/mixture_of_samplers.py b/sampling_methods/mixture_of_samplers.py new file mode 100644 index 0000000..9b1edbc --- /dev/null +++ b/sampling_methods/mixture_of_samplers.py @@ -0,0 +1,110 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mixture of base sampling strategies + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +from sampling_methods.sampling_def import SamplingMethod +from sampling_methods.constants import AL_MAPPING, get_base_AL_mapping + +get_base_AL_mapping() + + +class MixtureOfSamplers(SamplingMethod): + """Samples according to mixture of base sampling methods. + + If duplicate points are selected by the mixed strategies when forming the batch + then the remaining slots are divided according to mixture weights and + another partial batch is requested until the batch is full. + """ + def __init__(self, + X, + y, + seed, + mixture={'methods': ('margin', 'uniform'), + 'weight': (0.5, 0.5)}, + samplers=None): + self.X = X + self.y = y + self.name = 'mixture_of_samplers' + self.sampling_methods = mixture['methods'] + self.sampling_weights = dict(zip(mixture['methods'], mixture['weights'])) + self.seed = seed + # A list of initialized samplers is allowed as an input because + # for AL_methods that search over different mixtures, may want mixtures to + # have shared AL_methods so that initialization is only performed once for + # computation intensive methods like HierarchicalClusteringAL and + # states are shared between mixtures. + # If initialized samplers are not provided, initialize them ourselves. + if samplers is None: + self.samplers = {} + self.initialize(self.sampling_methods) + else: + self.samplers = samplers + self.history = [] + + def initialize(self, samplers): + self.samplers = {} + for s in samplers: + self.samplers[s] = AL_MAPPING[s](self.X, self.y, self.seed) + + def select_batch_(self, already_selected, N, **kwargs): + """Returns batch of datapoints selected according to mixture weights. 
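# The margin score computed in margin_AL.py above (and reused verbatim in
# informative_diverse.py), as a standalone helper. clf is assumed to be any
# fitted classifier exposing predict_proba; clf and X_pool are placeholders.
import numpy as np

def smallest_margin_order(clf, X_pool):
    probs = clf.predict_proba(X_pool)      # shape (n_samples, n_classes)
    top2 = np.sort(probs, axis=1)[:, -2:]  # two largest scores per row
    margin = top2[:, 1] - top2[:, 0]       # small margin == high uncertainty
    return np.argsort(margin)              # most uncertain points first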
+ + Args: + already_included: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to add using margin active learner + """ + kwargs['already_selected'] = copy.copy(already_selected) + inds = set() + self.selected_by_sampler = {} + for s in self.sampling_methods: + self.selected_by_sampler[s] = [] + effective_N = 0 + while len(inds) < N: + effective_N += N - len(inds) + for s in self.sampling_methods: + if len(inds) < N: + batch_size = min(max(int(self.sampling_weights[s] * effective_N), 1), N) + sampler = self.samplers[s] + kwargs['N'] = batch_size + s_inds = sampler.select_batch(**kwargs) + for ind in s_inds: + if ind not in self.selected_by_sampler[s]: + self.selected_by_sampler[s].append(ind) + s_inds = [d for d in s_inds if d not in inds] + s_inds = s_inds[0 : min(len(s_inds), N-len(inds))] + inds.update(s_inds) + self.history.append(copy.deepcopy(self.selected_by_sampler)) + return list(inds) + + def to_dict(self): + output = {} + output['history'] = self.history + output['samplers'] = self.sampling_methods + output['mixture_weights'] = self.sampling_weights + for s in self.samplers: + s_output = self.samplers[s].to_dict() + output[s] = s_output + return output diff --git a/sampling_methods/represent_cluster_centers.py b/sampling_methods/represent_cluster_centers.py new file mode 100644 index 0000000..f761d19 --- /dev/null +++ b/sampling_methods/represent_cluster_centers.py @@ -0,0 +1,78 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Another informative and diverse sampler that mirrors the algorithm described +in Xu, et. al., Representative Sampling for Text Classification Using +Support Vector Machines, 2003 + +Batch is created by clustering points within the margin of the classifier and +choosing points closest to the k centroids. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from sklearn.cluster import MiniBatchKMeans +import numpy as np +from sampling_methods.sampling_def import SamplingMethod + + +class RepresentativeClusterMeanSampling(SamplingMethod): + """Selects batch based on informative and diverse criteria. + + Returns points within the margin of the classifier that are closest to the + k-means centers of those points. 
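# Picking the pool point nearest to each k-means centroid, as described above,
# takes a single pairwise-distance call. Standalone sketch; X_margin is a
# synthetic stand-in for the points that fall inside the classifier's margin.
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances

X_margin = np.random.RandomState(0).rand(300, 16)
km = MiniBatchKMeans(n_clusters=10, random_state=0).fit(X_margin)
dist = pairwise_distances(km.cluster_centers_, X_margin)  # shape (10, 300)
closest = np.argmin(dist, axis=1)  # one candidate index per centroid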
+ """ + + def __init__(self, X, y, seed): + self.name = 'cluster_mean' + self.X = X + self.flat_X = self.flatten_X() + self.y = y + self.seed = seed + + def select_batch_(self, model, N, already_selected, **kwargs): + # Probably okay to always use MiniBatchKMeans + # Should standardize data before clustering + # Can cluster on standardized data but train on raw features if desired + try: + distances = model.decision_function(self.X) + except: + distances = model.predict_proba(self.X) + if len(distances.shape) < 2: + min_margin = abs(distances) + else: + sort_distances = np.sort(distances, 1)[:, -2:] + min_margin = sort_distances[:, 1] - sort_distances[:, 0] + rank_ind = np.argsort(min_margin) + rank_ind = [i for i in rank_ind if i not in already_selected] + + distances = abs(model.decision_function(self.X)) + min_margin_by_class = np.min(abs(distances[already_selected]),axis=0) + unlabeled_in_margin = np.array([i for i in range(len(self.y)) + if i not in already_selected and + any(distances[i] 2: + flat_X = np.reshape(self.X, (shape[0],np.product(shape[1:]))) + return flat_X + + + @abc.abstractmethod + def select_batch_(self): + return + + def select_batch(self, **kwargs): + return self.select_batch_(**kwargs) + + def to_dict(self): + return None \ No newline at end of file diff --git a/sampling_methods/simulate_batch.py b/sampling_methods/simulate_batch.py new file mode 100644 index 0000000..c7f37c2 --- /dev/null +++ b/sampling_methods/simulate_batch.py @@ -0,0 +1,261 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Select a new batch based on results of simulated trajectories.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import math + +import numpy as np + +from sampling_methods.wrapper_sampler_def import AL_MAPPING +from sampling_methods.wrapper_sampler_def import WrapperSamplingMethod + + +class SimulateBatchSampler(WrapperSamplingMethod): + """Creates batch based on trajectories simulated using smaller batch sizes. + + Current support use case: simulate smaller batches than the batch size + actually indicated to emulate which points would be selected in a + smaller batch setting. This method can do better than just selecting + a batch straight out if smaller batches perform better and the simulations + are informative enough and are not hurt too much by labeling noise. + """ + + def __init__(self, + X, + y, + seed, + samplers=[{'methods': ('margin', 'uniform'),'weight': (1, 0)}], + n_sims=10, + train_per_sim=10, + return_type='best_sim'): + """ Initialize sampler with options. 
+ + Args: + X: training data + y: labels may be used by base sampling methods + seed: seed for np.random + samplers: list of dicts with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + n_sims: number of total trajectories to simulate + train_per_sim: number of minibatches to split the batch into + return_type: two return types supported right now + best_sim: return points selected by the best trajectory + frequency: returns points selected the most over all trajectories + """ + self.name = 'simulate_batch' + self.X = X + self.y = y + self.seed = seed + self.n_sims = n_sims + self.train_per_sim = train_per_sim + self.return_type = return_type + self.samplers_list = samplers + self.initialize_samplers(self.samplers_list) + self.trace = [] + self.selected = [] + np.random.seed(seed) + + def simulate_batch(self, sampler, N, already_selected, y, model, X_test, + y_test, **kwargs): + """Simulates smaller batches by using hallucinated y to select next batch. + + Assumes that select_batch is only dependent on already_selected and not on + any other states internal to the sampler. i.e. this would not work with + BanditDiscreteSampler but will work with margin, hierarchical, and uniform. + + Args: + sampler: dict with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + N: batch size + already_selected: indices already labeled + y: y to use for training + model: model to use for margin calc + X_test: validaiton data + y_test: validation labels + + Returns: + - mean accuracy + - indices selected by best hallucinated trajectory + - best accuracy achieved by one of the trajectories + """ + minibatch = max(int(math.ceil(N / self.train_per_sim)), 1) + results = [] + best_acc = 0 + best_inds = [] + self.selected = [] + n_minibatch = int(N/minibatch) + (N % minibatch > 0) + + for _ in range(self.n_sims): + inds = [] + hallucinated_y = [] + + # Copy these objects to make sure they are not modified while simulating + # trajectories as they are used later by the main run_experiment script. 
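# Each simulated trajectory below selects minibatches with the base sampler,
# draws a hallucinated label for every newly picked point from the current
# model's predict_proba, refits on the real plus hallucinated labels, and
# scores the refit model on (X_test, y_test); the indices chosen by the
# best-scoring trajectory are reported back as best_inds.
# (E.g. N=25 with train_per_sim=10 gives minibatch=3 and 9 rounds, the last of
# size 1.)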
+ kwargs['already_selected'] = copy.copy(already_selected) + kwargs['y'] = copy.copy(y) + # Assumes that model has already by fit using all labeled data so + # the probabilities can be used immediately to hallucinate labels + kwargs['model'] = copy.deepcopy(model) + + for _ in range(n_minibatch): + batch_size = min(minibatch, N-len(inds)) + if batch_size > 0: + kwargs['N'] = batch_size + new_inds = sampler.select_batch(**kwargs) + inds.extend(new_inds) + + # All models need to have predict_proba method + probs = kwargs['model'].predict_proba(self.X[new_inds]) + # Hallucinate labels for selected datapoints to be label + # using class probabilities from model + try: + classes = kwargs['model'].best_estimator_.classes_ + except: + classes = kwargs['model'].classes_ + new_y = ([ + np.random.choice(classes, p=probs[i, :]) + for i in range(batch_size) + ]) + hallucinated_y.extend(new_y) + # Not saving already_selected here, if saving then should sort + # only for the input to fit but preserve ordering of indices in + # already_selected + kwargs['already_selected'] = sorted(kwargs['already_selected'] + + new_inds) + kwargs['y'][new_inds] = new_y + kwargs['model'].fit(self.X[kwargs['already_selected']], + kwargs['y'][kwargs['already_selected']]) + acc_hallucinated = kwargs['model'].score(X_test, y_test) + if acc_hallucinated > best_acc: + best_acc = acc_hallucinated + best_inds = inds + kwargs['model'].fit(self.X[kwargs['already_selected']], + y[kwargs['already_selected']]) + # Useful to know how accuracy compares for model trained on hallucinated + # labels vs trained on true labels. But can remove this train to speed + # up simulations. Won't speed up significantly since many more models + # are being trained inside the loop above. + acc_true = kwargs['model'].score(X_test, y_test) + results.append([acc_hallucinated, acc_true]) + print('Hallucinated acc: %.3f, Actual Acc: %.3f' % (acc_hallucinated, + acc_true)) + + # Save trajectory for reference + t = {} + t['arm'] = sampler + t['data_size'] = len(kwargs['already_selected']) + t['inds'] = inds + t['y_hal'] = hallucinated_y + t['acc_hal'] = acc_hallucinated + t['acc_true'] = acc_true + self.trace.append(t) + self.selected.extend(inds) + # Delete created copies + del kwargs['model'] + del kwargs['already_selected'] + results = np.array(results) + return np.mean(results, axis=0), best_inds, best_acc + + def sampler_select_batch(self, sampler, N, already_selected, y, model, X_test, y_test, **kwargs): + """Calculate the performance of the model if the batch had been selected using the base method without simulation. 
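# Drawing hallucinated labels from a model's class probabilities, as done in
# the simulation loop above, in isolation. clf and X_new are placeholders for
# any fitted classifier with predict_proba and the freshly selected points.
import numpy as np

def hallucinate_labels(clf, X_new, rng=np.random):
    probs = clf.predict_proba(X_new)
    return np.array([rng.choice(clf.classes_, p=p) for p in probs])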
+ + Args: + sampler: dict with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + N: batch size + already_selected: indices already selected + y: labels to use for training + model: model to use for training + X_test, y_test: validation set + + Returns: + - indices selected by base method + - validation accuracy of model trained on new batch + """ + m = copy.deepcopy(model) + kwargs['y'] = y + kwargs['model'] = m + kwargs['already_selected'] = copy.copy(already_selected) + inds = [] + kwargs['N'] = N + inds.extend(sampler.select_batch(**kwargs)) + kwargs['already_selected'] = sorted(kwargs['already_selected'] + inds) + + m.fit(self.X[kwargs['already_selected']], y[kwargs['already_selected']]) + acc = m.score(X_test, y_test) + del m + del kwargs['already_selected'] + return inds, acc + + def select_batch_(self, N, already_selected, y, model, + X_test, y_test, **kwargs): + """ Returns a batch of size N selected by using the best sampler in simulation + + Args: + samplers: list of sampling methods represented by dict with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + N: batch size + already_selected: indices of datapoints already labeled + y: actual labels, used to compare simulation with actual + model: training model to use to evaluate different samplers. Model must + have a predict_proba method with same signature as that in sklearn + n_sims: the number of simulations to perform for each sampler + minibatch: batch size to use for simulation + """ + + results = [] + + # THE INPUTS CANNOT BE MODIFIED SO WE MAKE COPIES FOR THE CHECK LATER + # Should check model but kernel_svm does not have coef_ so need better + # handling here + copy_selected = copy.copy(already_selected) + copy_y = copy.copy(y) + + for s in self.samplers: + sim_results, sim_inds, sim_acc = self.simulate_batch( + s, N, already_selected, y, model, X_test, y_test, **kwargs) + real_inds, acc = self.sampler_select_batch( + s, N, already_selected, y, model, X_test, y_test, **kwargs) + print('Best simulated acc: %.3f, Actual acc: %.3f' % (sim_acc, acc)) + results.append([sim_results, sim_inds, real_inds, acc]) + best_s = np.argmax([r[0][0] for r in results]) + + # Make sure that model object fed in did not change during simulations + assert all(y == copy_y) + assert all([copy_selected[i] == already_selected[i] + for i in range(len(already_selected))]) + + # Return indices based on return type specified + if self.return_type == 'best_sim': + return results[best_s][1] + elif self.return_type == 'frequency': + unique, counts = np.unique(self.selected, return_counts=True) + argcount = np.argsort(-counts) + return list(unique[argcount[0:N]]) + return results[best_s][2] + + def to_dict(self): + output = {} + output['simulated_trajectories'] = self.trace + return output diff --git a/sampling_methods/uniform_sampling.py b/sampling_methods/uniform_sampling.py new file mode 100644 index 0000000..27866b3 --- /dev/null +++ b/sampling_methods/uniform_sampling.py @@ -0,0 +1,52 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Uniform sampling method. + +Samples in batches. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from sampling_methods.sampling_def import SamplingMethod + + +class UniformSampling(SamplingMethod): + + def __init__(self, X, y, seed): + self.X = X + self.y = y + self.name = 'uniform' + np.random.seed(seed) + + def select_batch_(self, already_selected, N, **kwargs): + """Returns batch of randomly sampled datapoints. + + Assumes that data has already been shuffled. + + Args: + already_selected: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to label + """ + + # This is uniform given the remaining pool but biased wrt the entire pool. + sample = [i for i in range(self.X.shape[0]) if i not in already_selected] + return sample[0:N] diff --git a/sampling_methods/utils/__init__.py b/sampling_methods/utils/__init__.py new file mode 100644 index 0000000..3eeb306 --- /dev/null +++ b/sampling_methods/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/sampling_methods/utils/tree.py b/sampling_methods/utils/tree.py new file mode 100644 index 0000000..bfa59d1 --- /dev/null +++ b/sampling_methods/utils/tree.py @@ -0,0 +1,158 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Node and Tree class to support hierarchical clustering AL method. + +Assumed to be binary tree. + +Node class is used to represent each node in a hierarchical clustering. +Each node has certain properties that are used in the AL method. + +Tree class is used to traverse a hierarchical clustering. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + + +class Node(object): + """Node class for hierarchical clustering. + + Initialized with name and left right children. 
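# Tree (below) is constructed from a node_dict that maps every internal node id
# to its (left, right) child ids, with leaves mapping to (None, None);
# tree_test.py exercises this with a seven-node example.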
+ """ + + def __init__(self, name, left=None, right=None): + self.name = name + self.left = left + self.right = right + self.is_leaf = left is None and right is None + self.parent = None + # Fields for hierarchical clustering AL + self.score = 1.0 + self.split = False + self.best_label = None + self.weight = None + + def set_parent(self, parent): + self.parent = parent + + +class Tree(object): + """Tree object for traversing a binary tree. + + Most methods apply to trees in general with the exception of get_pruning + which is specific to the hierarchical clustering AL method. + """ + + def __init__(self, root, node_dict): + """Initializes tree and creates all nodes in node_dict. + + Args: + root: id of the root node + node_dict: dictionary with node_id as keys and entries indicating + left and right child of node respectively. + """ + self.node_dict = node_dict + self.root = self.make_tree(root) + self.nodes = {} + self.leaves_mapping = {} + self.fill_parents() + self.n_leaves = None + + def print_tree(self, node, max_depth): + """Helper function to print out tree for debugging.""" + node_list = [node] + output = "" + level = 0 + while level < max_depth and len(node_list): + children = set() + for n in node_list: + node = self.get_node(n) + output += ("\t"*level+"node %d: score %.2f, weight %.2f" % + (node.name, node.score, node.weight)+"\n") + if node.left: + children.add(node.left.name) + if node.right: + children.add(node.right.name) + level += 1 + node_list = children + return print(output) + + def make_tree(self, node_id): + if node_id is not None: + return Node(node_id, + self.make_tree(self.node_dict[node_id][0]), + self.make_tree(self.node_dict[node_id][1])) + + def fill_parents(self): + # Setting parent and storing nodes in dict for fast access + def rec(pointer, parent): + if pointer is not None: + self.nodes[pointer.name] = pointer + pointer.set_parent(parent) + rec(pointer.left, pointer) + rec(pointer.right, pointer) + rec(self.root, None) + + def get_node(self, node_id): + return self.nodes[node_id] + + def get_ancestor(self, node): + ancestors = [] + if isinstance(node, int): + node = self.get_node(node) + while node.name != self.root.name: + node = node.parent + ancestors.append(node.name) + return ancestors + + def fill_weights(self): + for v in self.node_dict: + node = self.get_node(v) + node.weight = len(self.leaves_mapping[v]) / (1.0 * self.n_leaves) + + def create_child_leaves_mapping(self, leaves): + """DP for creating child leaves mapping. + + Storing in dict to save recompute. + """ + self.n_leaves = len(leaves) + for v in leaves: + self.leaves_mapping[v] = [v] + node_list = set([self.get_node(v).parent for v in leaves]) + while node_list: + to_fill = copy.copy(node_list) + for v in node_list: + if (v.left.name in self.leaves_mapping + and v.right.name in self.leaves_mapping): + to_fill.remove(v) + self.leaves_mapping[v.name] = (self.leaves_mapping[v.left.name] + + self.leaves_mapping[v.right.name]) + if v.parent is not None: + to_fill.add(v.parent) + node_list = to_fill + self.fill_weights() + + def get_child_leaves(self, node): + return self.leaves_mapping[node] + + def get_pruning(self, node): + if node.split: + return self.get_pruning(node.left) + self.get_pruning(node.right) + else: + return [node.name] + diff --git a/sampling_methods/utils/tree_test.py b/sampling_methods/utils/tree_test.py new file mode 100644 index 0000000..d36ea8b --- /dev/null +++ b/sampling_methods/utils/tree_test.py @@ -0,0 +1,79 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for sampling_methods.utils.tree.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +from sampling_methods.utils import tree + + +class TreeTest(unittest.TestCase): + + def setUp(self): + node_dict = { + 1: (2, 3), + 2: (4, 5), + 3: (6, 7), + 4: [None, None], + 5: [None, None], + 6: [None, None], + 7: [None, None] + } + self.tree = tree.Tree(1, node_dict) + self.tree.create_child_leaves_mapping([4, 5, 6, 7]) + node = self.tree.get_node(1) + node.split = True + node = self.tree.get_node(2) + node.split = True + + def assertNode(self, node, name, left, right): + self.assertEqual(node.name, name) + self.assertEqual(node.left.name, left) + self.assertEqual(node.right.name, right) + + def testTreeRootSetCorrectly(self): + self.assertNode(self.tree.root, 1, 2, 3) + + def testGetNode(self): + node = self.tree.get_node(1) + assert isinstance(node, tree.Node) + self.assertEqual(node.name, 1) + + def testFillParent(self): + node = self.tree.get_node(3) + self.assertEqual(node.parent.name, 1) + + def testGetAncestors(self): + ancestors = self.tree.get_ancestor(5) + self.assertTrue(all([a in ancestors for a in [1, 2]])) + + def testChildLeaves(self): + leaves = self.tree.get_child_leaves(3) + self.assertTrue(all([c in leaves for c in [6, 7]])) + + def testFillWeights(self): + node = self.tree.get_node(3) + self.assertEqual(node.weight, 0.5) + + def testGetPruning(self): + node = self.tree.get_node(1) + pruning = self.tree.get_pruning(node) + self.assertTrue(all([n in pruning for n in [3, 4, 5]])) + +if __name__ == '__main__': + unittest.main() diff --git a/sampling_methods/wrapper_sampler_def.py b/sampling_methods/wrapper_sampler_def.py new file mode 100644 index 0000000..09361ba --- /dev/null +++ b/sampling_methods/wrapper_sampler_def.py @@ -0,0 +1,50 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Abstract class for wrapper sampling methods that call base sampling methods. + +Provides interface to sampling methods that allow same signature +for select_batch. Each subclass implements select_batch_ with the desired +signature for readability. 
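# initialize_samplers (below) expects a list of mixture dicts in the format
# consumed by MixtureOfSamplers, e.g. (illustrative values only):
mixtures = [
    {'methods': ('margin', 'uniform'), 'weights': (0.5, 0.5)},
    {'methods': ('margin', 'informative_diverse'), 'weights': (0.3, 0.7)},
]
# Note that MixtureOfSamplers reads the 'weights' key, so mixtures passed in
# should spell it that way.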
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +from sampling_methods.constants import AL_MAPPING +from sampling_methods.constants import get_all_possible_arms +from sampling_methods.sampling_def import SamplingMethod + +get_all_possible_arms() + + +class WrapperSamplingMethod(SamplingMethod): + __metaclass__ = abc.ABCMeta + + def initialize_samplers(self, mixtures): + methods = [] + for m in mixtures: + methods += m['methods'] + methods = set(methods) + self.base_samplers = {} + for s in methods: + self.base_samplers[s] = AL_MAPPING[s](self.X, self.y, self.seed) + self.samplers = [] + for m in mixtures: + self.samplers.append( + AL_MAPPING['mixture_of_samplers'](self.X, self.y, self.seed, m, + self.base_samplers)) diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..3eeb306 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/utils/allconv.py b/utils/allconv.py new file mode 100644 index 0000000..f67070d --- /dev/null +++ b/utils/allconv.py @@ -0,0 +1,196 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
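# The wrapper below mirrors the scikit-learn estimator API used elsewhere in
# this repo (fit, predict, score, decision_function, transform, get_params,
# set_params). Hypothetical usage, not runnable as-is:
#   model = AllConv(epochs=5, batch_size=32)
#   model.fit(X_train, y_train)
#   acc = model.score(X_val, y_val)
#   feats = model.transform(X_val)  # activations of the last conv layer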
+ +"""Implements allconv model in keras using tensorflow backend.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import keras +import keras.backend as K +from keras.layers import Activation +from keras.layers import Conv2D +from keras.layers import Dropout +from keras.layers import GlobalAveragePooling2D +from keras.models import Sequential + +import numpy as np +import tensorflow as tf + + +class AllConv(object): + """allconv network that matches sklearn api.""" + + def __init__(self, + random_state=1, + epochs=50, + batch_size=32, + solver='rmsprop', + learning_rate=0.001, + lr_decay=0.): + # params + self.solver = solver + self.epochs = epochs + self.batch_size = batch_size + self.learning_rate = learning_rate + self.lr_decay = lr_decay + # data + self.encode_map = None + self.decode_map = None + self.model = None + self.random_state = random_state + self.n_classes = None + + def build_model(self, X): + # assumes that data axis order is same as the backend + input_shape = X.shape[1:] + np.random.seed(self.random_state) + tf.set_random_seed(self.random_state) + + model = Sequential() + model.add(Conv2D(96, (3, 3), padding='same', + input_shape=input_shape, name='conv1')) + model.add(Activation('relu')) + model.add(Conv2D(96, (3, 3), name='conv2', padding='same')) + model.add(Activation('relu')) + model.add(Conv2D(96, (3, 3), strides=(2, 2), padding='same', name='conv3')) + model.add(Activation('relu')) + model.add(Dropout(0.5)) + + model.add(Conv2D(192, (3, 3), name='conv4', padding='same')) + model.add(Activation('relu')) + model.add(Conv2D(192, (3, 3), name='conv5', padding='same')) + model.add(Activation('relu')) + model.add(Conv2D(192, (3, 3), strides=(2, 2), name='conv6', padding='same')) + model.add(Activation('relu')) + model.add(Dropout(0.5)) + + model.add(Conv2D(192, (3, 3), name='conv7', padding='same')) + model.add(Activation('relu')) + model.add(Conv2D(192, (1, 1), name='conv8', padding='valid')) + model.add(Activation('relu')) + model.add(Conv2D(10, (1, 1), name='conv9', padding='valid')) + + model.add(GlobalAveragePooling2D()) + model.add(Activation('softmax', name='activation_top')) + model.summary() + + try: + optimizer = getattr(keras.optimizers, self.solver) + except: + raise NotImplementedError('optimizer not implemented in keras') + # All optimizers with the exception of nadam take decay as named arg + try: + opt = optimizer(lr=self.learning_rate, decay=self.lr_decay) + except: + opt = optimizer(lr=self.learning_rate, schedule_decay=self.lr_decay) + + model.compile(loss='categorical_crossentropy', + optimizer=opt, + metrics=['accuracy']) + # Save initial weights so that model can be retrained with same + # initialization + self.initial_weights = copy.deepcopy(model.get_weights()) + + self.model = model + + def create_y_mat(self, y): + y_encode = self.encode_y(y) + y_encode = np.reshape(y_encode, (len(y_encode), 1)) + y_mat = keras.utils.to_categorical(y_encode, self.n_classes) + return y_mat + + # Add handling for classes that do not start counting from 0 + def encode_y(self, y): + if self.encode_map is None: + self.classes_ = sorted(list(set(y))) + self.n_classes = len(self.classes_) + self.encode_map = dict(zip(self.classes_, range(len(self.classes_)))) + self.decode_map = dict(zip(range(len(self.classes_)), self.classes_)) + mapper = lambda x: self.encode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def decode_y(self, y): + mapper = lambda x: 
self.decode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def fit(self, X_train, y_train, sample_weight=None): + y_mat = self.create_y_mat(y_train) + + if self.model is None: + self.build_model(X_train) + + # We don't want incremental fit so reset learning rate and weights + K.set_value(self.model.optimizer.lr, self.learning_rate) + self.model.set_weights(self.initial_weights) + self.model.fit( + X_train, + y_mat, + batch_size=self.batch_size, + epochs=self.epochs, + shuffle=True, + sample_weight=sample_weight, + verbose=0) + + def predict(self, X_val): + predicted = self.model.predict(X_val) + return predicted + + def score(self, X_val, val_y): + y_mat = self.create_y_mat(val_y) + val_acc = self.model.evaluate(X_val, y_mat)[1] + return val_acc + + def decision_function(self, X): + return self.predict(X) + + def transform(self, X): + model = self.model + inp = [model.input] + activations = [] + + # Get activations of the last conv layer. + output = [layer.output for layer in model.layers if + layer.name == 'conv9'][0] + func = K.function(inp + [K.learning_phase()], [output]) + for i in range(int(X.shape[0]/self.batch_size) + 1): + minibatch = X[i * self.batch_size + : min(X.shape[0], (i+1) * self.batch_size)] + list_inputs = [minibatch, 0.] + # Learning phase. 0 = Test mode (no dropout or batch normalization) + layer_output = func(list_inputs)[0] + activations.append(layer_output) + output = np.vstack(tuple(activations)) + output = np.reshape(output, (output.shape[0],np.product(output.shape[1:]))) + return output + + def get_params(self, deep = False): + params = {} + params['solver'] = self.solver + params['epochs'] = self.epochs + params['batch_size'] = self.batch_size + params['learning_rate'] = self.learning_rate + params['weight_decay'] = self.lr_decay + if deep: + return copy.deepcopy(params) + return copy.copy(params) + + def set_params(self, **parameters): + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self diff --git a/utils/chart_data.py b/utils/chart_data.py new file mode 100644 index 0000000..8fd876e --- /dev/null +++ b/utils/chart_data.py @@ -0,0 +1,230 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experiment charting script. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pickle + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages + +from google.apputils import app +import gflags as flags +from tensorflow import gfile + +flags.DEFINE_string('source_dir', + '/tmp/toy_experiments', + 'Directory with the output to analyze.') +flags.DEFINE_string('save_dir', '/tmp/active_learning', + 'Directory to save charts.') +flags.DEFINE_string('dataset', 'letter', 'Dataset to analyze.') +flags.DEFINE_string( + 'sampling_methods', + ('uniform,margin,informative_diverse,' + 'pred_expert_advice_trip_agg,' + 'mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34'), + 'Comma separated string of sampling methods to include in chart.') +flags.DEFINE_string('scoring_methods', 'logistic,kernel_ls', + 'Comma separated string of scoring methods to chart.') +flags.DEFINE_bool('normalize', False, 'Chart runs using normalized data.') +flags.DEFINE_bool('standardize', True, 'Chart runs using standardized data.') + +FLAGS = flags.FLAGS + + +def combine_results(files, diff=False): + all_results = {} + for f in files: + data = pickle.load(gfile.FastGFile(f, 'r')) + for k in data: + if isinstance(k, tuple): + data[k].pop('noisy_targets') + data[k].pop('indices') + data[k].pop('selected_inds') + data[k].pop('sampler_output') + key = list(k) + seed = key[-1] + key = key[0:10] + key = tuple(key) + if key in all_results: + if seed not in all_results[key]['random_seeds']: + all_results[key]['random_seeds'].append(seed) + for field in [f for f in data[k] if f != 'n_points']: + all_results[key][field] = np.vstack( + (all_results[key][field], data[k][field])) + else: + all_results[key] = data[k] + all_results[key]['random_seeds'] = [seed] + else: + all_results[k] = data[k] + return all_results + + +def plot_results(all_results, score_method, norm, stand, sampler_filter): + colors = { + 'margin': + 'gold', + 'uniform': + 'k', + 'informative_diverse': + 'r', + 'mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34': + 'b', + 'pred_expert_advice_trip_agg': + 'g' + } + labels = { + 'margin': + 'margin', + 'uniform': + 'uniform', + 'mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34': + 'margin:0.33,informative_diverse:0.33, uniform:0.34', + 'informative_diverse': + 'informative and diverse', + 'pred_expert_advice_trip_agg': + 'expert: margin,informative_diverse,uniform' + } + markers = { + 'margin': + 'None', + 'uniform': + 'None', + 'mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34': + '>', + 'informative_diverse': + 'None', + 'pred_expert_advice_trip_agg': + 'p' + } + fields = all_results['tuple_keys'] + fields = dict(zip(fields, range(len(fields)))) + + for k in sorted(all_results.keys()): + sampler = k[fields['sampler']] + if (isinstance(k, tuple) and + k[fields['score_method']] == score_method and + k[fields['standardize']] == stand and + k[fields['normalize']] == norm and + (sampler_filter is None or sampler in sampler_filter)): + results = all_results[k] + n_trials = results['accuracy'].shape[0] + x = results['data_sizes'][0] + mean_acc = np.mean(results['accuracy'], axis=0) + CI_acc = np.std(results['accuracy'], axis=0) / np.sqrt(n_trials) * 2.96 + if sampler == 'uniform': + plt.plot( + x, + mean_acc, + linewidth=1, + label=labels[sampler], + color=colors[sampler], + linestyle='--' + ) + plt.fill_between( + x, + mean_acc - CI_acc, 
+ mean_acc + CI_acc, + color=colors[sampler], + alpha=0.2 + ) + else: + plt.plot( + x, + mean_acc, + linewidth=1, + label=labels[sampler], + color=colors[sampler], + marker=markers[sampler], + markeredgecolor=colors[sampler] + ) + plt.fill_between( + x, + mean_acc - CI_acc, + mean_acc + CI_acc, + color=colors[sampler], + alpha=0.2 + ) + plt.legend(loc=4) + + +def get_between(filename, start, end): + start_ind = filename.find(start) + len(start) + end_ind = filename.rfind(end) + return filename[start_ind:end_ind] + + +def get_sampling_method(dataset, filename): + return get_between(filename, dataset + '_', '/') + + +def get_scoring_method(filename): + return get_between(filename, 'results_score_', '_select_') + + +def get_normalize(filename): + return get_between(filename, '_norm_', '_stand_') == 'True' + + +def get_standardize(filename): + return get_between( + filename, '_stand_', filename[filename.rfind('_'):]) == 'True' + + +def main(argv): + del argv # Unused. + if not gfile.Exists(FLAGS.save_dir): + gfile.MkDir(FLAGS.save_dir) + charting_filepath = os.path.join(FLAGS.save_dir, + FLAGS.dataset + '_charts.pdf') + sampling_methods = FLAGS.sampling_methods.split(',') + scoring_methods = FLAGS.scoring_methods.split(',') + files = gfile.Glob( + os.path.join(FLAGS.source_dir, FLAGS.dataset + '*/results*.pkl')) + files = [ + f for f in files + if (get_sampling_method(FLAGS.dataset, f) in sampling_methods and + get_scoring_method(f) in scoring_methods and + get_normalize(f) == FLAGS.normalize and + get_standardize(f) == FLAGS.standardize) + ] + + print('Reading in %d files...' % len(files)) + all_results = combine_results(files) + pdf = PdfPages(charting_filepath) + + print('Plotting charts...') + plt.style.use('ggplot') + for m in scoring_methods: + plot_results( + all_results, + m, + FLAGS.normalize, + FLAGS.standardize, + sampler_filter=sampling_methods) + plt.title('Dataset: %s, Score Method: %s' % (FLAGS.dataset, m)) + pdf.savefig() + plt.close() + pdf.close() + + +if __name__ == '__main__': + app.run() diff --git a/utils/create_data.py b/utils/create_data.py new file mode 100644 index 0000000..b47726c --- /dev/null +++ b/utils/create_data.py @@ -0,0 +1,284 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Make datasets and save specified directory. + +Downloads datasets using scikit datasets and can also parse csv file +to save into pickle format. 
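# Every loader in this module returns an object exposing .data (features) and
# .target (labels), mirroring the Bunch convention of the scikit-learn dataset
# fetchers, before get_mldata() pickles them as a {'data', 'target'} dict.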
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from io import BytesIO +import os +import pickle +import StringIO +import tarfile +import urllib2 + +import keras.backend as K +from keras.datasets import cifar10 +from keras.datasets import cifar100 +from keras.datasets import mnist + +import numpy as np +import pandas as pd +from sklearn.datasets import fetch_20newsgroups_vectorized +from sklearn.datasets import fetch_mldata +from sklearn.datasets import load_breast_cancer +from sklearn.datasets import load_iris +import sklearn.datasets.rcv1 +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfTransformer + +from google.apputils import app +import gflags as flags +from tensorflow import gfile + +flags.DEFINE_string('save_dir', '/tmp/data', + 'Where to save outputs') +flags.DEFINE_string('datasets', '', + 'Which datasets to download, comma separated.') +FLAGS = flags.FLAGS + + +class Dataset(object): + + def __init__(self, X, y): + self.data = X + self.target = y + + +def get_csv_data(filename): + """Parse csv and return Dataset object with data and targets. + + Create pickle data from csv, assumes the first column contains the targets + Args: + filename: complete path of the csv file + Returns: + Dataset object + """ + f = gfile.GFile(filename, 'r') + mat = [] + for l in f: + row = l.strip() + row = row.replace('"', '') + row = row.split(',') + row = [float(x) for x in row] + mat.append(row) + mat = np.array(mat) + y = mat[:, 0] + X = mat[:, 1:] + data = Dataset(X, y) + return data + + +def get_wikipedia_talk_data(): + """Get wikipedia talk dataset. + + See here for more information about the dataset: + https://figshare.com/articles/Wikipedia_Detox_Data/4054689 + Downloads annotated comments and annotations. 
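# The labeling step below keeps a comment as an "attack" only when a majority
# of annotators flagged it. The same majority vote in isolation, on a toy frame:
import pandas as pd

ann = pd.DataFrame({'rev_id': [1, 1, 1, 2, 2],
                    'attack': [1, 1, 0, 0, 0]})
labels = ann.groupby('rev_id')['attack'].mean() > 0.5  # rev 1 -> True, rev 2 -> False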
+ """ + + ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634' + ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637' + + def download_file(url): + req = urllib2.Request(url) + response = urllib2.urlopen(req) + return response + + # Process comments + comments = pd.read_table( + download_file(ANNOTATED_COMMENTS_URL), index_col=0, sep='\t') + # remove newline and tab tokens + comments['comment'] = comments['comment'].apply( + lambda x: x.replace('NEWLINE_TOKEN', ' ')) + comments['comment'] = comments['comment'].apply( + lambda x: x.replace('TAB_TOKEN', ' ')) + + # Process labels + annotations = pd.read_table(download_file(ANNOTATIONS_URL), sep='\t') + # labels a comment as an atack if the majority of annoatators did so + labels = annotations.groupby('rev_id')['attack'].mean() > 0.5 + + # Perform data preprocessing, should probably tune these hyperparameters + vect = CountVectorizer(max_features=30000, ngram_range=(1, 2)) + tfidf = TfidfTransformer(norm='l2') + X = tfidf.fit_transform(vect.fit_transform(comments['comment'])) + y = np.array(labels) + data = Dataset(X, y) + return data + + +def get_keras_data(dataname): + """Get datasets using keras API and return as a Dataset object.""" + if dataname == 'cifar10_keras': + train, test = cifar10.load_data() + elif dataname == 'cifar100_coarse_keras': + train, test = cifar100.load_data('coarse') + elif dataname == 'cifar100_keras': + train, test = cifar100.load_data() + elif dataname == 'mnist_keras': + train, test = mnist.load_data() + else: + raise NotImplementedError('dataset not supported') + + X = np.concatenate((train[0], test[0])) + y = np.concatenate((train[1], test[1])) + + if dataname == 'mnist_keras': + # Add extra dimension for channel + num_rows = X.shape[1] + num_cols = X.shape[2] + X = X.reshape(X.shape[0], 1, num_rows, num_cols) + if K.image_data_format() == 'channels_last': + X = X.transpose(0, 2, 3, 1) + + y = y.flatten() + data = Dataset(X, y) + return data + + +# TODO(lishal): remove regular cifar10 dataset and only use dataset downloaded +# from keras to maintain image dims to create tensor for tf models +# Requires adding handling in run_experiment.py for handling of different +# training methods that require either 2d or tensor data. +def get_cifar10(): + """Get CIFAR-10 dataset from source dir. + + Slightly redundant with keras function to get cifar10 but this returns + in flat format instead of keras numpy image tensor. + """ + url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' + def download_file(url): + req = urllib2.Request(url) + response = urllib2.urlopen(req) + return response + response = download_file(url) + tmpfile = BytesIO() + while True: + # Download a piece of the file from the connection + s = response.read(16384) + # Once the entire file has been downloaded, tarfile returns b'' + # (the empty bytes) which is a falsey value + if not s: + break + # Otherwise, write the piece of the file to the temporary file. 
+    tmpfile.write(s)
+  response.close()
+
+  tmpfile.seek(0)
+  tar_dir = tarfile.open(mode='r:gz', fileobj=tmpfile)
+  X = None
+  y = None
+  for member in tar_dir.getnames():
+    if '_batch' in member:
+      filestream = tar_dir.extractfile(member).read()
+      batch = pickle.load(StringIO.StringIO(filestream))
+      if X is None:
+        X = np.array(batch['data'], dtype=np.uint8)
+        y = np.array(batch['labels'])
+      else:
+        X = np.concatenate((X, np.array(batch['data'], dtype=np.uint8)))
+        y = np.concatenate((y, np.array(batch['labels'])))
+  data = Dataset(X, y)
+  return data
+
+
+def get_mldata(dataset):
+  # Use scikit-learn to grab datasets and save them to save_dir.
+  save_dir = FLAGS.save_dir
+  filename = os.path.join(save_dir, dataset[1]+'.pkl')
+
+  if not gfile.Exists(save_dir):
+    gfile.MkDir(save_dir)
+  if not gfile.Exists(filename):
+    if dataset[0][-3:] == 'csv':
+      data = get_csv_data(dataset[0])
+    elif dataset[0] == 'breast_cancer':
+      data = load_breast_cancer()
+    elif dataset[0] == 'iris':
+      data = load_iris()
+    elif dataset[0] == 'newsgroup':
+      # Remove header information to make sure that no newsgroup-identifying
+      # information is included in the data.
+      data = fetch_20newsgroups_vectorized(subset='all', remove=('headers'))
+      tfidf = TfidfTransformer(norm='l2')
+      X = tfidf.fit_transform(data.data)
+      data.data = X
+    elif dataset[0] == 'rcv1':
+      sklearn.datasets.rcv1.URL = (
+          'http://www.ai.mit.edu/projects/jmlr/papers/'
+          'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
+      sklearn.datasets.rcv1.URL_topics = (
+          'http://www.ai.mit.edu/projects/jmlr/papers/'
+          'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
+      data = sklearn.datasets.fetch_rcv1(
+          data_home='/tmp')
+    elif dataset[0] == 'wikipedia_attack':
+      data = get_wikipedia_talk_data()
+    elif dataset[0] == 'cifar10':
+      data = get_cifar10()
+    elif 'keras' in dataset[0]:
+      data = get_keras_data(dataset[0])
+    else:
+      try:
+        data = fetch_mldata(dataset[0])
+      except:
+        raise Exception('ERROR: failed to fetch data from mldata.org')
+    X = data.data
+    y = data.target
+    if X.shape[0] != y.shape[0]:
+      X = np.transpose(X)
+    assert X.shape[0] == y.shape[0]
+
+    data = {'data': X, 'target': y}
+    pickle.dump(data, gfile.GFile(filename, 'w'))
+
+
+def main(argv):
+  del argv  # Unused.
+  # First entry of tuple is mldata.org name, second is the name that we'll use
+  # to reference the data.
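+  # For example, ('mnist (original)', 'mnist') fetches the mldata.org dataset
+  # named 'mnist (original)' and saves it as mnist.pkl under FLAGS.save_dir.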
+ datasets = [('mnist (original)', 'mnist'), ('australian', 'australian'), + ('heart', 'heart'), ('breast_cancer', 'breast_cancer'), + ('iris', 'iris'), ('vehicle', 'vehicle'), ('wine', 'wine'), + ('waveform ida', 'waveform'), ('german ida', 'german'), + ('splice ida', 'splice'), ('ringnorm ida', 'ringnorm'), + ('twonorm ida', 'twonorm'), ('diabetes_scale', 'diabetes'), + ('mushrooms', 'mushrooms'), ('letter', 'letter'), ('dna', 'dna'), + ('banana-ida', 'banana'), ('letter', 'letter'), ('dna', 'dna'), + ('newsgroup', 'newsgroup'), ('cifar10', 'cifar10'), + ('cifar10_keras', 'cifar10_keras'), + ('cifar100_keras', 'cifar100_keras'), + ('cifar100_coarse_keras', 'cifar100_coarse_keras'), + ('mnist_keras', 'mnist_keras'), + ('wikipedia_attack', 'wikipedia_attack'), + ('rcv1', 'rcv1')] + + if FLAGS.datasets: + subset = FLAGS.datasets.split(',') + datasets = [d for d in datasets if d[1] in subset] + + for d in datasets: + print(d[1]) + get_mldata(d) + + +if __name__ == '__main__': + app.run() diff --git a/utils/kernel_block_solver.py b/utils/kernel_block_solver.py new file mode 100644 index 0000000..d3e29eb --- /dev/null +++ b/utils/kernel_block_solver.py @@ -0,0 +1,185 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Block kernel lsqr solver for multi-class classification.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import math + +import numpy as np +import scipy.linalg as linalg +from scipy.sparse.linalg import spsolve +from sklearn import metrics + + +class BlockKernelSolver(object): + """Inspired by algorithm from https://arxiv.org/pdf/1602.05310.pdf.""" + # TODO(lishal): save preformed kernel matrix and reuse if possible + # perhaps not possible if want to keep scikitlearn signature + + def __init__(self, + random_state=1, + C=0.1, + block_size=4000, + epochs=3, + verbose=False, + gamma=None): + self.block_size = block_size + self.epochs = epochs + self.C = C + self.kernel = 'rbf' + self.coef_ = None + self.verbose = verbose + self.encode_map = None + self.decode_map = None + self.gamma = gamma + self.X_train = None + self.random_state = random_state + + def encode_y(self, y): + # Handles classes that do not start counting from 0. 
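+    # For example, labels [2, 5, 9] are encoded as [0, 1, 2]; decode_y applies
+    # the inverse mapping.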
+ if self.encode_map is None: + self.classes_ = sorted(list(set(y))) + self.encode_map = dict(zip(self.classes_, range(len(self.classes_)))) + self.decode_map = dict(zip(range(len(self.classes_)), self.classes_)) + mapper = lambda x: self.encode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def decode_y(self, y): + mapper = lambda x: self.decode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def fit(self, X_train, y_train, sample_weight=None): + """Form K and solve (K + lambda * I)x = y in a block-wise fashion.""" + np.random.seed(self.random_state) + self.X_train = X_train + n_features = X_train.shape[1] + y = self.encode_y(y_train) + if self.gamma is None: + self.gamma = 1./n_features + K = metrics.pairwise.pairwise_kernels( + X_train, metric=self.kernel, gamma=self.gamma) + if self.verbose: + print('Finished forming kernel matrix.') + + # compute some constants + num_classes = len(list(set(y))) + num_samples = K.shape[0] + num_blocks = math.ceil(num_samples*1.0/self.block_size) + x = np.zeros((K.shape[0], num_classes)) + y_hat = np.zeros((K.shape[0], num_classes)) + onehot = lambda x: np.eye(num_classes)[x] + y_onehot = np.array(map(onehot, y)) + idxes = np.diag_indices(num_samples) + if sample_weight is not None: + weights = np.sqrt(sample_weight) + weights = weights[:, np.newaxis] + y_onehot = weights * y_onehot + K *= np.outer(weights, weights) + if num_blocks == 1: + epochs = 1 + else: + epochs = self.epochs + + for e in range(epochs): + shuffled_coords = np.random.choice( + num_samples, num_samples, replace=False) + for b in range(int(num_blocks)): + residuals = y_onehot - y_hat + + # Form a block of K. + K[idxes] += (self.C * num_samples) + block = shuffled_coords[b*self.block_size: + min((b+1)*self.block_size, num_samples)] + K_block = K[:, block] + # Dim should be block size x block size + KbTKb = K_block.T.dot(K_block) + + if self.verbose: + print('solving block {0}'.format(b)) + # Try linalg solve then sparse solve for handling of sparse input. 
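+        # The update solves the normal equations
+        # KbTKb * dx = K_block.T.dot(residuals) for this block, and dx is then
+        # added to this block's coefficients in x below.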
+ try: + x_block = linalg.solve(KbTKb, K_block.T.dot(residuals)) + except: + try: + x_block = spsolve(KbTKb, K_block.T.dot(residuals)) + except: + return None + + # update model + x[block] = x[block] + x_block + K[idxes] = K[idxes] - (self.C * num_samples) + y_hat = K.dot(x) + + y_pred = np.argmax(y_hat, axis=1) + train_acc = metrics.accuracy_score(y, y_pred) + if self.verbose: + print('Epoch: {0}, Block: {1}, Train Accuracy: {2}' + .format(e, b, train_acc)) + self.coef_ = x + + def predict(self, X_val): + val_K = metrics.pairwise.pairwise_kernels( + X_val, self.X_train, metric=self.kernel, gamma=self.gamma) + val_pred = np.argmax(val_K.dot(self.coef_), axis=1) + return self.decode_y(val_pred) + + def score(self, X_val, val_y): + val_pred = self.predict(X_val) + val_acc = metrics.accuracy_score(val_y, val_pred) + return val_acc + + def decision_function(self, X, type='predicted'): + # Return the predicted value of the best class + # Margin_AL will see that a vector is returned and not a matrix and + # simply select the points that have the lowest predicted value to label + K = metrics.pairwise.pairwise_kernels( + X, self.X_train, metric=self.kernel, gamma=self.gamma) + predicted = K.dot(self.coef_) + if type == 'scores': + val_best = np.max(K.dot(self.coef_), axis=1) + return val_best + elif type == 'predicted': + return predicted + else: + raise NotImplementedError('Invalid return type for decision function.') + + def get_params(self, deep=False): + params = {} + params['C'] = self.C + params['gamma'] = self.gamma + if deep: + return copy.deepcopy(params) + return copy.copy(params) + + def set_params(self, **parameters): + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self + + def softmax_over_predicted(self, X): + val_K = metrics.pairwise.pairwise_kernels( + X, self.X_train, metric=self.kernel, gamma=self.gamma) + val_pred = val_K.dot(self.coef_) + row_min = np.min(val_pred, axis=1) + val_pred = val_pred - row_min[:, None] + val_pred = np.exp(val_pred) + sum_exp = np.sum(val_pred, axis=1) + val_pred = val_pred/sum_exp[:, None] + return val_pred diff --git a/utils/small_cnn.py b/utils/small_cnn.py new file mode 100644 index 0000000..ea8b0dd --- /dev/null +++ b/utils/small_cnn.py @@ -0,0 +1,199 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements Small CNN model in keras using tensorflow backend.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import keras +import keras.backend as K +from keras.layers import Activation +from keras.layers import Conv2D +from keras.layers import Dense +from keras.layers import Dropout +from keras.layers import Flatten +from keras.layers import MaxPooling2D +from keras.models import Sequential + +import numpy as np +import tensorflow as tf + + +class SmallCNN(object): + """Small convnet that matches sklearn api. 
+ + Implements model from + https://github.com/fchollet/keras/blob/master/examples/cifar10_cnn.py + Adapts for inputs of variable size, expects data to be 4d tensor, with + # of obserations as first dimension and other dimensions to correspond to + length width and # of channels in image. + """ + + def __init__(self, + random_state=1, + epochs=50, + batch_size=32, + solver='rmsprop', + learning_rate=0.001, + lr_decay=0.): + # params + self.solver = solver + self.epochs = epochs + self.batch_size = batch_size + self.learning_rate = learning_rate + self.lr_decay = lr_decay + # data + self.encode_map = None + self.decode_map = None + self.model = None + self.random_state = random_state + self.n_classes = None + + def build_model(self, X): + # assumes that data axis order is same as the backend + input_shape = X.shape[1:] + np.random.seed(self.random_state) + tf.set_random_seed(self.random_state) + + model = Sequential() + model.add(Conv2D(32, (3, 3), padding='same', + input_shape=input_shape, name='conv1')) + model.add(Activation('relu')) + model.add(Conv2D(32, (3, 3), name='conv2')) + model.add(Activation('relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + model.add(Dropout(0.25)) + + model.add(Conv2D(64, (3, 3), padding='same', name='conv3')) + model.add(Activation('relu')) + model.add(Conv2D(64, (3, 3), name='conv4')) + model.add(Activation('relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + model.add(Dropout(0.25)) + + model.add(Flatten()) + model.add(Dense(512, name='dense1')) + model.add(Activation('relu')) + model.add(Dropout(0.5)) + model.add(Dense(self.n_classes, name='dense2')) + model.add(Activation('softmax')) + + try: + optimizer = getattr(keras.optimizers, self.solver) + except: + raise NotImplementedError('optimizer not implemented in keras') + # All optimizers with the exception of nadam take decay as named arg + try: + opt = optimizer(lr=self.learning_rate, decay=self.lr_decay) + except: + opt = optimizer(lr=self.learning_rate, schedule_decay=self.lr_decay) + + model.compile(loss='categorical_crossentropy', + optimizer=opt, + metrics=['accuracy']) + # Save initial weights so that model can be retrained with same + # initialization + self.initial_weights = copy.deepcopy(model.get_weights()) + + self.model = model + + def create_y_mat(self, y): + y_encode = self.encode_y(y) + y_encode = np.reshape(y_encode, (len(y_encode), 1)) + y_mat = keras.utils.to_categorical(y_encode, self.n_classes) + return y_mat + + # Add handling for classes that do not start counting from 0 + def encode_y(self, y): + if self.encode_map is None: + self.classes_ = sorted(list(set(y))) + self.n_classes = len(self.classes_) + self.encode_map = dict(zip(self.classes_, range(len(self.classes_)))) + self.decode_map = dict(zip(range(len(self.classes_)), self.classes_)) + mapper = lambda x: self.encode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def decode_y(self, y): + mapper = lambda x: self.decode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def fit(self, X_train, y_train, sample_weight=None): + y_mat = self.create_y_mat(y_train) + + if self.model is None: + self.build_model(X_train) + + # We don't want incremental fit so reset learning rate and weights + K.set_value(self.model.optimizer.lr, self.learning_rate) + self.model.set_weights(self.initial_weights) + self.model.fit( + X_train, + y_mat, + batch_size=self.batch_size, + epochs=self.epochs, + shuffle=True, + sample_weight=sample_weight, + verbose=0) + + def predict(self, X_val): + 
predicted = self.model.predict(X_val) + return predicted + + def score(self, X_val, val_y): + y_mat = self.create_y_mat(val_y) + val_acc = self.model.evaluate(X_val, y_mat)[1] + return val_acc + + def decision_function(self, X): + return self.predict(X) + + def transform(self, X): + model = self.model + inp = [model.input] + activations = [] + + # Get activations of the first dense layer. + output = [layer.output for layer in model.layers if + layer.name == 'dense1'][0] + func = K.function(inp + [K.learning_phase()], [output]) + for i in range(int(X.shape[0]/self.batch_size) + 1): + minibatch = X[i * self.batch_size + : min(X.shape[0], (i+1) * self.batch_size)] + list_inputs = [minibatch, 0.] + # Learning phase. 0 = Test mode (no dropout or batch normalization) + layer_output = func(list_inputs)[0] + activations.append(layer_output) + output = np.vstack(tuple(activations)) + return output + + def get_params(self, deep = False): + params = {} + params['solver'] = self.solver + params['epochs'] = self.epochs + params['batch_size'] = self.batch_size + params['learning_rate'] = self.learning_rate + params['weight_decay'] = self.lr_decay + if deep: + return copy.deepcopy(params) + return copy.copy(params) + + def set_params(self, **parameters): + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..a1d2b8b --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,336 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for run_experiment.py.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import os +import pickle +import sys + +import numpy as np +import scipy + +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC +from sklearn.svm import SVC + +from tensorflow import gfile + + +from utils.kernel_block_solver import BlockKernelSolver +from utils.small_cnn import SmallCNN +from utils.allconv import AllConv + + +class Logger(object): + """Logging object to write to file and stdout.""" + + def __init__(self, filename): + self.terminal = sys.stdout + self.log = gfile.GFile(filename, "w") + + def write(self, message): + self.terminal.write(message) + self.log.write(message) + + def flush(self): + self.terminal.flush() + + def flush_file(self): + self.log.flush() + + +def create_checker_unbalanced(split, n, grid_size): + """Creates a dataset with two classes that occupy one color of checkboard. + + Args: + split: splits to use for class imbalance. + n: number of datapoints to sample. + grid_size: checkerboard size. + Returns: + X: 2d features. + y: binary class. 
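+
+  For example, split=[0.2, 0.8] yields roughly four times as many points in
+  one class as in the other.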
+ """ + y = np.zeros(0) + X = np.zeros((0, 2)) + for i in range(grid_size): + for j in range(grid_size): + label = 0 + n_0 = int(n/(grid_size*grid_size) * split[0] * 2) + if (i-j) % 2 == 0: + label = 1 + n_0 = int(n/(grid_size*grid_size) * split[1] * 2) + x_1 = np.random.uniform(i, i+1, n_0) + x_2 = np.random.uniform(j, j+1, n_0) + x = np.vstack((x_1, x_2)) + x = x.T + X = np.concatenate((X, x)) + y_0 = label * np.ones(n_0) + y = np.concatenate((y, y_0)) + return X, y + + +def flatten_X(X): + shape = X.shape + flat_X = X + if len(shape) > 2: + flat_X = np.reshape(X, (shape[0], np.product(shape[1:]))) + return flat_X + + +def get_mldata(data_dir, name): + """Loads data from data_dir. + + Looks for the file in data_dir. + Assumes that data is in pickle format with dictionary fields data and target. + + + Args: + data_dir: directory to look in + name: dataset name, assumes data is saved in the save_dir with filename + .pkl + Returns: + data and targets + Raises: + NameError: dataset not found in data folder. + """ + dataname = name + if dataname == "checkerboard": + X, y = create_checker_unbalanced(split=[1./5, 4./5], n=10000, grid_size=4) + else: + filename = os.path.join(data_dir, dataname + ".pkl") + if not gfile.Exists(filename): + raise NameError("ERROR: dataset not available") + data = pickle.load(gfile.GFile(filename, "r")) + X = data["data"] + y = data["target"] + if "keras" in dataname: + X = X / 255 + y = y.flatten() + return X, y + + +def filter_data(X, y, keep=None): + """Filters data by class indicated in keep. + + Args: + X: train data + y: train targets + keep: defaults to None which will keep everything, otherwise takes a list + of classes to keep + + Returns: + filtered data and targets + """ + if keep is None: + return X, y + keep_ind = [i for i in range(len(y)) if y[i] in keep] + return X[keep_ind], y[keep_ind] + + +def get_class_counts(y_full, y): + """Gets the count of all classes in a sample. + + Args: + y_full: full target vector containing all classes + y: sample vector for which to perform the count + Returns: + count of classes for the sample vector y, the class order for count will + be the same as long as same y_full is fed in + """ + classes = np.unique(y_full) + classes = np.sort(classes) + unique, counts = np.unique(y, return_counts=True) + complete_counts = [] + for c in classes: + if c not in unique: + complete_counts.append(0) + else: + index = np.where(unique == c)[0][0] + complete_counts.append(counts[index]) + return np.array(complete_counts) + + +def flip_label(y, percent_random): + """Flips a percentage of labels for one class to the other. + + Randomly sample a percent of points and randomly label the sampled points as + one of the other classes. + Does not introduce bias. 
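+  For example, percent_random=0.1 relabels roughly 10% of the points, each
+  with a class chosen uniformly from the remaining classes.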
+ + Args: + y: labels of all datapoints + percent_random: percent of datapoints to corrupt the labels + + Returns: + new labels with noisy labels for indicated percent of data + """ + classes = np.unique(y) + y_orig = copy.copy(y) + indices = range(y_orig.shape[0]) + np.random.shuffle(indices) + sample = indices[0:int(len(indices) * 1.0 * percent_random)] + fake_labels = [] + for s in sample: + label = y[s] + class_ind = np.where(classes == label)[0][0] + other_classes = np.delete(classes, class_ind) + np.random.shuffle(other_classes) + fake_label = other_classes[0] + assert fake_label != label + fake_labels.append(fake_label) + y[sample] = np.array(fake_labels) + assert all(y[indices[len(sample):]] == y_orig[indices[len(sample):]]) + return y + + +def get_model(method, seed=13): + """Construct sklearn model using either logistic regression or linear svm. + + Wraps grid search on regularization parameter over either logistic regression + or svm, returns constructed model + + Args: + method: string indicating scikit method to use, currently accepts logistic + and linear svm. + seed: int or rng to use for random state fed to scikit method + + Returns: + scikit learn model + """ + # TODO(lishal): extend to include any scikit model that implements + # a decision function. + # TODO(lishal): for kernel methods, currently using default value for gamma + # but should probably tune. + if method == "logistic": + model = LogisticRegression(random_state=seed, multi_class="multinomial", + solver="lbfgs", max_iter=200) + params = {"C": [10.0**(i) for i in range(-4, 5)]} + elif method == "logistic_ovr": + model = LogisticRegression(random_state=seed) + params = {"C": [10.0**(i) for i in range(-5, 4)]} + elif method == "linear_svm": + model = LinearSVC(random_state=seed) + params = {"C": [10.0**(i) for i in range(-4, 5)]} + elif method == "kernel_svm": + model = SVC(random_state=seed) + params = {"C": [10.0**(i) for i in range(-4, 5)]} + elif method == "kernel_ls": + model = BlockKernelSolver(random_state=seed) + params = {"C": [10.0**(i) for i in range(-6, 1)]} + elif method == "small_cnn": + # Model does not work with weighted_expert or simulate_batch + model = SmallCNN(random_state=seed) + return model + elif method == "allconv": + # Model does not work with weighted_expert or simulate_batch + model = AllConv(random_state=seed) + return model + + else: + raise NotImplementedError("ERROR: " + method + " not implemented") + + model = GridSearchCV(model, params, cv=3) + return model + + +def calculate_entropy(batch_size, y_s): + """Calculates KL div between training targets and targets selected by AL. + + Args: + batch_size: batch size of datapoints selected by AL + y_s: vector of datapoints selected by AL. Assumes that the order of the + data is the order in which points were labeled by AL. Also assumes + that in the offline setting y_s will eventually overlap completely with + original training targets. 
+ Returns: + entropy between actual distribution of classes and distribution of + samples selected by AL + """ + n_batches = int(np.ceil(len(y_s) * 1.0 / batch_size)) + counts = get_class_counts(y_s, y_s) + true_dist = counts / (len(y_s) * 1.0) + entropy = [] + for b in range(n_batches): + sample = y_s[b * batch_size:(b + 1) * batch_size] + counts = get_class_counts(y_s, sample) + sample_dist = counts / (1.0 * len(sample)) + entropy.append(scipy.stats.entropy(true_dist, sample_dist)) + return entropy + + +def get_train_val_test_splits(X, y, max_points, seed, confusion, seed_batch, + split=(2./3, 1./6, 1./6)): + """Return training, validation, and test splits for X and y. + + Args: + X: features + y: targets + max_points: # of points to use when creating splits. + seed: seed for shuffling. + confusion: labeling noise to introduce. 0.1 means randomize 10% of labels. + seed_batch: # of initial datapoints to ensure sufficient class membership. + split: percent splits for train, val, and test. + Returns: + indices: shuffled indices to recreate splits given original input data X. + y_noise: y with noise injected, needed to reproduce results outside of + run_experiments using original data. + """ + np.random.seed(seed) + X_copy = copy.copy(X) + y_copy = copy.copy(y) + + # Introduce labeling noise + y_noise = flip_label(y_copy, confusion) + + indices = np.arange(len(y)) + + if max_points is None: + max_points = len(y_noise) + else: + max_points = min(len(y_noise), max_points) + train_split = int(max_points * split[0]) + val_split = train_split + int(max_points * split[1]) + assert seed_batch <= train_split + + # Do this to make sure that the initial batch has examples from all classes + min_shuffle = 3 + n_shuffle = 0 + y_tmp = y_noise + + # Need at least 4 obs of each class for 2 fold CV to work in grid search step + while (any(get_class_counts(y_tmp, y_tmp[0:seed_batch]) < 4) + or n_shuffle < min_shuffle): + np.random.shuffle(indices) + y_tmp = y_noise[indices] + n_shuffle += 1 + + X_train = X_copy[indices[0:train_split]] + X_val = X_copy[indices[train_split:val_split]] + X_test = X_copy[indices[val_split:max_points]] + y_train = y_noise[indices[0:train_split]] + y_val = y_noise[indices[train_split:val_split]] + y_test = y_noise[indices[val_split:max_points]] + # Make sure that we have enough observations of each class for 2-fold cv + assert all(get_class_counts(y_noise, y_train[0:seed_batch]) >= 4) + # Make sure that returned shuffled indices are correct + assert all(y_noise[indices[0:max_points]] == + np.concatenate((y_train, y_val, y_test), axis=0)) + return (indices[0:max_points], X_train, y_train, + X_val, y_val, X_test, y_test, y_noise)
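+
+
+# Illustrative usage of the helpers above (a sketch only, not part of the
+# library; the 'iris' pickle and the parameter values are assumptions for
+# demonstration and require create_data.py to have been run first):
+#
+#   X, y = get_mldata('/tmp/data', 'iris')
+#   (indices, X_train, y_train, X_val, y_val, X_test, y_test,
+#    y_noise) = get_train_val_test_splits(
+#        X, y, max_points=None, seed=1, confusion=0.1, seed_batch=30)
+#   model = get_model('logistic', seed=1)
+#   model.fit(X_train, y_train)
+#   print('test accuracy: %f' % model.score(X_test, y_test))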