From 9eba67d49d57fcff49676d59db2628d95b4141a5 Mon Sep 17 00:00:00 2001 From: Afshin Rostamizadeh Date: Fri, 6 Oct 2017 18:57:13 -0400 Subject: [PATCH] First commit. --- CONTRIBUTING.md | 23 ++ LICENSE | 202 ++++++++++ README.md | 171 +++++++++ __init__.py | 14 + requirements.txt | 9 + run_experiment.py | 344 +++++++++++++++++ sampling_methods/__init__.py | 14 + sampling_methods/bandit_discrete.py | 125 ++++++ sampling_methods/constants.py | 127 ++++++ sampling_methods/graph_density.py | 92 +++++ .../hierarchical_clustering_AL.py | 362 ++++++++++++++++++ sampling_methods/informative_diverse.py | 101 +++++ sampling_methods/kcenter_greedy.py | 123 ++++++ sampling_methods/margin_AL.py | 64 ++++ sampling_methods/mixture_of_samplers.py | 110 ++++++ sampling_methods/represent_cluster_centers.py | 78 ++++ sampling_methods/sampling_def.py | 54 +++ sampling_methods/simulate_batch.py | 261 +++++++++++++ sampling_methods/uniform_sampling.py | 52 +++ sampling_methods/utils/__init__.py | 14 + sampling_methods/utils/tree.py | 158 ++++++++ sampling_methods/utils/tree_test.py | 79 ++++ sampling_methods/wrapper_sampler_def.py | 50 +++ utils/__init__.py | 14 + utils/allconv.py | 196 ++++++++++ utils/chart_data.py | 230 +++++++++++ utils/create_data.py | 284 ++++++++++++++ utils/kernel_block_solver.py | 185 +++++++++ utils/small_cnn.py | 199 ++++++++++ utils/utils.py | 336 ++++++++++++++++ 30 files changed, 4071 insertions(+) create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 __init__.py create mode 100644 requirements.txt create mode 100644 run_experiment.py create mode 100644 sampling_methods/__init__.py create mode 100644 sampling_methods/bandit_discrete.py create mode 100644 sampling_methods/constants.py create mode 100644 sampling_methods/graph_density.py create mode 100644 sampling_methods/hierarchical_clustering_AL.py create mode 100644 sampling_methods/informative_diverse.py create mode 100644 sampling_methods/kcenter_greedy.py create mode 100644 sampling_methods/margin_AL.py create mode 100644 sampling_methods/mixture_of_samplers.py create mode 100644 sampling_methods/represent_cluster_centers.py create mode 100644 sampling_methods/sampling_def.py create mode 100644 sampling_methods/simulate_batch.py create mode 100644 sampling_methods/uniform_sampling.py create mode 100644 sampling_methods/utils/__init__.py create mode 100644 sampling_methods/utils/tree.py create mode 100644 sampling_methods/utils/tree_test.py create mode 100644 sampling_methods/wrapper_sampler_def.py create mode 100644 utils/__init__.py create mode 100644 utils/allconv.py create mode 100644 utils/chart_data.py create mode 100644 utils/create_data.py create mode 100644 utils/kernel_block_solver.py create mode 100644 utils/small_cnn.py create mode 100644 utils/utils.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..ae319c7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,23 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution, +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. 
+ +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7a4a3ea --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..41e2565 --- /dev/null +++ b/README.md @@ -0,0 +1,171 @@ +# Active Learning Playground + +## Introduction + +This is a python module for experimenting with different active learning +algorithms. There are a few key components to running active learning +experiments: + +* Main experiment script is + [`run_experiment.py`](run_experiment.py) + with many flags for different run options. + +* Supported datasets can be downloaded to a specified directory by running + [`utils/create_data.py`](utils/create_data.py). + +* Supported active learning methods are in + [`sampling_methods`](sampling_methods/). + +Below I will go into each component in more detail. + +DISCLAIMER: This is not an official Google product. + +## Setup +The dependencies are in [`requirements.txt`](requirements.txt). Please make sure these packages are +installed before running experiments. If GPU capable `tensorflow` is desired, please follow +instructions [here](https://www.tensorflow.org/install/). + +It is highly suggested that you install all dependencies into a separate `virtualenv` for +easy package management. + +## Getting benchmark datasets + +By default the datasets are saved to `/tmp/data`. You can specify another directory via the +`--save_dir` flag. + +Redownloading all the datasets will be very time consuming so please be patient. +You can specify a subset of the data to download by passing in a comma separated +string of datasets via the `--datasets` flag. + +## Running experiments + +There are a few key flags for +[`run_experiment.py`](run_experiment.py): + +* `dataset`: name of the dataset, must match the save name used in + `create_data.py`. Must also exist in the data_dir. + +* `sampling_method`: active learning method to use. Must be specified in + [`sampling_methods/constants.py`](sampling_methods/constants.py). + +* `warmstart_size`: initial batch of uniformly sampled examples to use as seed + data. Float indicates percentage of total training data and integer + indicates raw size. + +* `batch_size`: number of datapoints to request in each batch. Float indicates + percentage of total training data and integer indicates raw size. + +* `score_method`: model to use to evaluate the performance of the sampling + method. 
Must be in the `get_model` method of
+  [`utils/utils.py`](utils/utils.py).
+
+* `data_dir`: directory with saved datasets.
+
+* `save_dir`: directory to save results.
+
+This is just a subset of all the flags. There are also options for
+preprocessing, introducing labeling noise, dataset subsampling, and using a
+different model to select than to score/evaluate.
+
+## Available active learning methods
+
+All named active learning methods are in
+[`sampling_methods/constants.py`](sampling_methods/constants.py).
+
+You can also specify a mixture of active learning methods by following the
+pattern of `[sampling_method]-[mixture_weight]` separated by dashes, e.g.
+`mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34`.
+
+Some supported sampling methods include:
+
+* Uniform: samples are selected via uniform sampling.
+
+* Margin: uncertainty-based sampling method.
+
+* Informative and diverse: margin- and cluster-based sampling method.
+
+* k-center greedy: representative strategy that greedily forms a batch of
+  points to minimize the maximum distance from a labeled point.
+
+* Graph density: representative strategy that selects points in dense regions
+  of the pool.
+
+* Exp3 bandit: meta-active learning method that tries to learn the optimal
+  sampling method using a popular multi-armed bandit algorithm.
+
+### Adding new active learning methods
+
+Implement either a base sampler that inherits from
+[`SamplingMethod`](sampling_methods/sampling_def.py)
+or a meta-sampler that calls base samplers and inherits from
+[`WrapperSamplingMethod`](sampling_methods/wrapper_sampler_def.py).
+
+The only method that must be implemented by any sampler is `select_batch_`,
+which can have arbitrary named arguments. The only restriction is that the name
+for the same input must be consistent across all the samplers (i.e. the indices
+for already selected examples all have the same name across samplers). Adding a
+new named argument that hasn't been used in other sampling methods will require
+feeding that into the `select_batch` call in
+[`run_experiment.py`](run_experiment.py).
+
+After implementing your sampler, be sure to add it to
+[`constants.py`](sampling_methods/constants.py)
+so that it can be called from
+[`run_experiment.py`](run_experiment.py).
+
+## Available models
+
+All available models are in the `get_model` method of
+[`utils/utils.py`](utils/utils.py).
+
+Supported methods:
+
+* Linear SVM: scikit-learn method with a grid search wrapper for the
+  regularization parameter.
+
+* Kernel SVM: scikit-learn method with a grid search wrapper for the
+  regularization parameter.
+
+* Logistic Regression: scikit-learn method with a grid search wrapper for the
+  regularization parameter.
+
+* Small CNN: 4-layer CNN optimized using rmsprop, implemented in Keras with a
+  tensorflow backend.
+
+* Kernel Least Squares Classification: block gradient descent solver that can
+  use multiple cores, so it is often faster than the scikit-learn kernel SVM.
+
+### Adding new models
+
+New models must follow the scikit-learn API and implement the following
+methods:
+
+* `fit(X, y[, sample_weight])`: fit the model to the input features and
+  target.
+
+* `predict(X)`: predict labels for the input features.
+
+* `score(X, y)`: return the target metric given test features and test
+  targets.
+
+* `decision_function(X)` (optional): return class probabilities, distance to
+  decision boundaries, or another metric that the margin sampler can use as a
+  measure of uncertainty.
+
+See [`small_cnn.py`](utils/small_cnn.py) for an example, and the sketch below
+for the general shape of such a wrapper.
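+
+As a rough, hypothetical sketch (not a file in this repository), a
+grid-search-wrapped scikit-learn classifier that satisfies this interface
+could look like the following; the class name, parameter grid, and defaults
+are illustrative only:
+
+```python
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+
+
+class GridSearchLogistic(object):
+  """Hypothetical example: logistic regression with grid-searched C."""
+
+  def __init__(self, random_state=1, C_grid=(0.01, 0.1, 1.0, 10.0)):
+    # Wrap the scikit-learn estimator in a grid search over regularization.
+    self.model = GridSearchCV(
+        LogisticRegression(random_state=random_state),
+        param_grid={'C': list(C_grid)}, cv=3)
+
+  def fit(self, X, y, sample_weight=None):
+    if sample_weight is None:
+      self.model.fit(X, y)
+    else:
+      self.model.fit(X, y, sample_weight=sample_weight)
+    return self
+
+  def predict(self, X):
+    return self.model.predict(X)
+
+  def score(self, X, y):
+    return self.model.score(X, y)
+
+  def decision_function(self, X):
+    # Smaller margins are treated as more uncertain by the margin sampler.
+    return self.model.decision_function(X)
+```
+
+The design simply mirrors the grid-search-around-regularization pattern used
+by the SVM and logistic regression entries above.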
+ +After implementing your new model, be sure to add it to `get_model` method of +[`utils/utils.py`](utils/utils.py). + +Currently models must be added on a one-off basis and not all scikit-learn +classifiers are supported due to the need for user input on whether and how to +tune the hyperparameters of the model. However, it is very easy to add a +scikit-learn model with hyperparameter search wrapped around as a supported +model. + +## Collecting results and charting + +The +[`utils/chart_data.py`](utils/chart_data.py) +script handles processing of data and charting for a specified dataset and +source directory. diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..3eeb306 --- /dev/null +++ b/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1840c77 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +numpy>=1.13 +scipy>=0.19 +pandas>=0.20 +scikit-learn>=0.19 +matplotlib>=2.0.2 +tensorflow>=1.3 +keras>=2.0.8 +google-apputils>=0.4.2 + diff --git a/run_experiment.py b/run_experiment.py new file mode 100644 index 0000000..aad001b --- /dev/null +++ b/run_experiment.py @@ -0,0 +1,344 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Run active learner on classification tasks. + +Supported datasets include mnist, letter, cifar10, newsgroup20, rcv1, +wikipedia attack, and select classification datasets from mldata. +See utils/create_data.py for all available datasets. + +For binary classification, mnist_4_9 indicates mnist filtered down to just 4 and +9. +By default uses logistic regression but can also train using kernel SVM. +2 fold cv is used to tune regularization parameter over a exponential grid. 
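+
+Example invocation (a sketch; the flag values shown are just the defaults
+defined below):
+
+  python run_experiment.py --dataset=letter --sampling_method=margin \
+    --warmstart_size=0.02 --batch_size=0.02 --score_method=logistic \
+    --data_dir=/tmp/data --save_dir=/tmp/toy_experiments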
+ +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pickle +import sys +from time import gmtime +from time import strftime + +import numpy as np +from sklearn.preprocessing import normalize +from sklearn.preprocessing import StandardScaler + +from google.apputils import app +import gflags as flags +from tensorflow import gfile + +from sampling_methods.constants import AL_MAPPING +from sampling_methods.constants import get_AL_sampler +from sampling_methods.constants import get_wrapper_AL_mapping +from utils import utils + +flags.DEFINE_string("dataset", "letter", "Dataset name") +flags.DEFINE_string("sampling_method", "margin", + ("Name of sampling method to use, can be any defined in " + "AL_MAPPING in sampling_methods.constants")) +flags.DEFINE_float( + "warmstart_size", 0.02, + ("Can be float or integer. Float indicates percentage of training data " + "to use in the initial warmstart model") +) +flags.DEFINE_float( + "batch_size", 0.02, + ("Can be float or integer. Float indicates batch size as a percentage " + "of training data size.") +) +flags.DEFINE_integer("trials", 1, + "Number of curves to create using different seeds") +flags.DEFINE_integer("seed", 1, "Seed to use for rng and random state") +# TODO(lisha): add feature noise to simulate data outliers +flags.DEFINE_string("confusions", "0.", "Percentage of labels to randomize") +flags.DEFINE_string("active_sampling_percentage", "1.0", + "Mixture weights on active sampling.") +flags.DEFINE_string( + "score_method", "logistic", + "Method to use to calculate accuracy.") +flags.DEFINE_string( + "select_method", "None", + "Method to use for selecting points.") +flags.DEFINE_string("normalize_data", "False", "Whether to normalize the data.") +flags.DEFINE_string("standardize_data", "True", + "Whether to standardize the data.") +flags.DEFINE_string("save_dir", "/tmp/toy_experiments", + "Where to save outputs") +flags.DEFINE_string("data_dir", "/tmp/data", + "Directory with predownloaded and saved datasets.") +flags.DEFINE_string("max_dataset_size", "15000", + ("maximum number of datapoints to include in data " + "zero indicates no limit")) +flags.DEFINE_float("train_horizon", "1.0", + "how far to extend learning curve as a percent of train") +flags.DEFINE_string("do_save", "True", + "whether to save log and results") +FLAGS = flags.FLAGS + + +get_wrapper_AL_mapping() + + +def generate_one_curve(X, + y, + sampler, + score_model, + seed, + warmstart_size, + batch_size, + select_model=None, + confusion=0., + active_p=1.0, + max_points=None, + standardize_data=False, + norm_data=False, + train_horizon=0.5): + """Creates one learning curve for both active and passive learning. + + Will calculate accuracy on validation set as the number of training data + points increases for both PL and AL. + Caveats: training method used is sensitive to sorting of the data so we + resort all intermediate datasets + + Args: + X: training data + y: training labels + sampler: sampling class from sampling_methods, assumes reference + passed in and sampler not yet instantiated. + score_model: model used to score the samplers. Expects fit and predict + methods to be implemented. + seed: seed used for data shuffle and other sources of randomness in sampler + or model training + warmstart_size: float or int. float indicates percentage of train data + to use for initial model + batch_size: float or int. 
float indicates batch size as a percent of + training data + select_model: defaults to None, in which case the score model will be + used to select new datapoints to label. Model must implement fit, predict + and depending on AL method may also need decision_function. + confusion: percentage of labels of one class to flip to the other + active_p: percent of batch to allocate to active learning + max_points: limit dataset size for preliminary + standardize_data: wheter to standardize the data to 0 mean unit variance + norm_data: whether to normalize the data. Default is False for logistic + regression. + train_horizon: how long to draw the curve for. Percent of training data. + + Returns: + results: dictionary of results for all samplers + sampler_states: dictionary of sampler objects for debugging + """ + # TODO(lishal): add option to find best hyperparameter setting first on + # full dataset and fix the hyperparameter for the rest of the routine + # This will save computation and also lead to more stable behavior for the + # test accuracy + + # TODO(lishal): remove mixture parameter and have the mixture be specified as + # a mixture of samplers strategy + def select_batch(sampler, uniform_sampler, mixture, N, already_selected, + **kwargs): + n_active = int(mixture * N) + n_passive = N - n_active + kwargs["N"] = n_active + kwargs["already_selected"] = already_selected + batch_AL = sampler.select_batch(**kwargs) + already_selected = already_selected + batch_AL + kwargs["N"] = n_passive + kwargs["already_selected"] = already_selected + batch_PL = uniform_sampler.select_batch(**kwargs) + return batch_AL + batch_PL + + np.random.seed(seed) + data_splits = [2./3, 1./6, 1./6] + + # 2/3 of data for training + if max_points is None: + max_points = len(y) + train_size = int(min(max_points, len(y)) * data_splits[0]) + if batch_size < 1: + batch_size = int(batch_size * train_size) + else: + batch_size = int(batch_size) + if warmstart_size < 1: + # Set seed batch to provide enough samples to get at least 4 per class + # TODO(lishal): switch to sklearn stratified sampler + seed_batch = int(warmstart_size * train_size) + else: + seed_batch = int(warmstart_size) + seed_batch = max(seed_batch, 6 * len(np.unique(y))) + + indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = ( + utils.get_train_val_test_splits(X,y,max_points,seed,confusion, + seed_batch, split=data_splits)) + + # Preprocess data + if norm_data: + print("Normalizing data") + X_train = normalize(X_train) + X_val = normalize(X_val) + X_test = normalize(X_test) + if standardize_data: + print("Standardizing data") + scaler = StandardScaler().fit(X_train) + X_train = scaler.transform(X_train) + X_val = scaler.transform(X_val) + X_test = scaler.transform(X_test) + print("active percentage: " + str(active_p) + " warmstart batch: " + + str(seed_batch) + " batch size: " + str(batch_size) + " confusion: " + + str(confusion) + " seed: " + str(seed)) + + # Initialize samplers + uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed) + sampler = sampler(X_train, y_train, seed) + + results = {} + data_sizes = [] + accuracy = [] + selected_inds = range(seed_batch) + + # If select model is None, use score_model + same_score_select = False + if select_model is None: + select_model = score_model + same_score_select = True + + n_batches = int(np.ceil((train_horizon * train_size - seed_batch) * + 1.0 / batch_size)) + 1 + for b in range(n_batches): + n_train = seed_batch + min(train_size - seed_batch, b * batch_size) + print("Training model 
on " + str(n_train) + " datapoints") + + assert n_train == len(selected_inds) + data_sizes.append(n_train) + + # Sort active_ind so that the end results matches that of uniform sampling + partial_X = X_train[sorted(selected_inds)] + partial_y = y_train[sorted(selected_inds)] + score_model.fit(partial_X, partial_y) + if not same_score_select: + select_model.fit(partial_X, partial_y) + acc = score_model.score(X_test, y_test) + accuracy.append(acc) + print("Sampler: %s, Accuracy: %.2f%%" % (sampler.name, accuracy[-1]*100)) + + n_sample = min(batch_size, train_size - len(selected_inds)) + select_batch_inputs = { + "model": select_model, + "labeled": dict(zip(selected_inds, y_train[selected_inds])), + "eval_acc": accuracy[-1], + "X_test": X_val, + "y_test": y_val, + "y": y_train + } + new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample, + selected_inds, **select_batch_inputs) + selected_inds.extend(new_batch) + print('Requested: %d, Selected: %d' % (n_sample, len(new_batch))) + assert len(new_batch) == n_sample + assert len(list(set(selected_inds))) == len(selected_inds) + + # Check that the returned indice are correct and will allow mapping to + # training set from original data + assert all(y_noise[indices[selected_inds]] == y_train[selected_inds]) + results["accuracy"] = accuracy + results["selected_inds"] = selected_inds + results["data_sizes"] = data_sizes + results["indices"] = indices + results["noisy_targets"] = y_noise + return results, sampler + + +def main(argv): + del argv + + if not gfile.Exists(FLAGS.save_dir): + try: + gfile.MkDir(FLAGS.save_dir) + except: + print(('WARNING: error creating save directory, ' + 'directory most likely already created.')) + + save_dir = os.path.join( + FLAGS.save_dir, + FLAGS.dataset + "_" + FLAGS.sampling_method) + do_save = FLAGS.do_save == "True" + + if do_save: + if not gfile.Exists(save_dir): + try: + gfile.MkDir(save_dir) + except: + print(('WARNING: error creating save directory, ' + 'directory most likely already created.')) + # Set up logging + filename = os.path.join( + save_dir, "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt") + sys.stdout = utils.Logger(filename) + + confusions = [float(t) for t in FLAGS.confusions.split(" ")] + mixtures = [float(t) for t in FLAGS.active_sampling_percentage.split(" ")] + all_results = {} + max_dataset_size = None if FLAGS.max_dataset_size == "0" else int( + FLAGS.max_dataset_size) + normalize_data = FLAGS.normalize_data == "True" + standardize_data = FLAGS.standardize_data == "True" + X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset) + starting_seed = FLAGS.seed + + for c in confusions: + for m in mixtures: + for seed in range(starting_seed, starting_seed + FLAGS.trials): + sampler = get_AL_sampler(FLAGS.sampling_method) + score_model = utils.get_model(FLAGS.score_method, seed) + if (FLAGS.select_method == "None" or + FLAGS.select_method == FLAGS.score_method): + select_model = None + else: + select_model = utils.get_model(FLAGS.select_method, seed) + results, sampler_state = generate_one_curve( + X, y, sampler, score_model, seed, FLAGS.warmstart_size, + FLAGS.batch_size, select_model, c, m, max_dataset_size, + standardize_data, normalize_data, FLAGS.train_horizon) + key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method, + FLAGS.select_method, m, FLAGS.warmstart_size, FLAGS.batch_size, + c, standardize_data, normalize_data, seed) + sampler_output = sampler_state.to_dict() + results["sampler_output"] = sampler_output + all_results[key] = results + fields = [ + 
"dataset", "sampler", "score_method", "select_method", + "active percentage", "warmstart size", "batch size", "confusion", + "standardize", "normalize", "seed" + ] + all_results["tuple_keys"] = fields + + if do_save: + filename = ("results_score_" + FLAGS.score_method + + "_select_" + FLAGS.select_method + + "_norm_" + str(normalize_data) + + "_stand_" + str(standardize_data)) + existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl")) + filename = os.path.join(save_dir, + filename + "_" + str(1000+len(existing_files))[1:] + ".pkl") + pickle.dump(all_results, gfile.GFile(filename, "w")) + sys.stdout.flush_file() + + +if __name__ == "__main__": + app.run() diff --git a/sampling_methods/__init__.py b/sampling_methods/__init__.py new file mode 100644 index 0000000..3eeb306 --- /dev/null +++ b/sampling_methods/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/sampling_methods/bandit_discrete.py b/sampling_methods/bandit_discrete.py new file mode 100644 index 0000000..44f1bf4 --- /dev/null +++ b/sampling_methods/bandit_discrete.py @@ -0,0 +1,125 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Bandit wrapper around base AL sampling methods. + +Assumes adversarial multi-armed bandit setting where arms correspond to +mixtures of different AL methods. + +Uses EXP3 algorithm to decide which AL method to use to create the next batch. +Similar to Hsu & Lin 2015, Active Learning by Learning. +https://www.csie.ntu.edu.tw/~htlin/paper/doc/aaai15albl.pdf +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from sampling_methods.wrapper_sampler_def import AL_MAPPING, WrapperSamplingMethod + + +class BanditDiscreteSampler(WrapperSamplingMethod): + """Wraps EXP3 around mixtures of indicated methods. + + Uses EXP3 mult-armed bandit algorithm to select sampler methods. + """ + def __init__(self, + X, + y, + seed, + reward_function = lambda AL_acc: AL_acc[-1], + gamma=0.5, + samplers=[{'methods':('margin','uniform'),'weights':(0,1)}, + {'methods':('margin','uniform'),'weights':(1,0)}]): + """Initializes sampler with indicated gamma and arms. + + Args: + X: training data + y: labels, may need to be input into base samplers + seed: seed to use for random sampling + reward_function: reward based on previously observed accuracies. Assumes + that the input is a sequence of observed accuracies. 
Will ultimately be + a class method and may need access to other class properties. + gamma: weight on uniform mixture. Arm probability updates are a weighted + mixture of uniform and an exponentially weighted distribution. + Lower gamma more aggressively updates based on observed rewards. + samplers: list of dicts with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + """ + + self.name = 'bandit_discrete' + np.random.seed(seed) + self.X = X + self.y = y + self.seed = seed + self.initialize_samplers(samplers) + + self.gamma = gamma + self.n_arms = len(samplers) + self.reward_function = reward_function + + self.pull_history = [] + self.acc_history = [] + self.w = np.ones(self.n_arms) + self.x = np.zeros(self.n_arms) + self.p = self.w / (1.0 * self.n_arms) + self.probs = [] + + def update_vars(self, arm_pulled): + reward = self.reward_function(self.acc_history) + self.x = np.zeros(self.n_arms) + self.x[arm_pulled] = reward / self.p[arm_pulled] + self.w = self.w * np.exp(self.gamma * self.x / self.n_arms) + self.p = ((1.0 - self.gamma) * self.w / sum(self.w) + + self.gamma / self.n_arms) + print(self.p) + self.probs.append(self.p) + + def select_batch_(self, already_selected, N, eval_acc, **kwargs): + """Returns batch of datapoints sampled using mixture of AL_methods. + + Assumes that data has already been shuffled. + + Args: + already_selected: index of datapoints already selected + N: batch size + eval_acc: accuracy of model trained after incorporating datapoints from + last recommended batch + + Returns: + indices of points selected to label + """ + # Update observed reward and arm probabilities + self.acc_history.append(eval_acc) + if len(self.pull_history) > 0: + self.update_vars(self.pull_history[-1]) + # Sample an arm + arm = np.random.choice(range(self.n_arms), p=self.p) + self.pull_history.append(arm) + kwargs['N'] = N + kwargs['already_selected'] = already_selected + sample = self.samplers[arm].select_batch(**kwargs) + return sample + + def to_dict(self): + output = {} + output['samplers'] = self.base_samplers + output['arm_probs'] = self.probs + output['pull_history'] = self.pull_history + output['rewards'] = self.acc_history + return output + diff --git a/sampling_methods/constants.py b/sampling_methods/constants.py new file mode 100644 index 0000000..232c8f6 --- /dev/null +++ b/sampling_methods/constants.py @@ -0,0 +1,127 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Controls imports to fill up dictionary of different sampling methods. 
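+
+A typical usage sketch (X, y, and seed are placeholders; samplers in this
+package are constructed as sampler(X, y, seed)):
+
+  from sampling_methods.constants import get_base_AL_mapping, get_AL_sampler
+  get_base_AL_mapping()  # fills AL_MAPPING with the base samplers
+  sampler = get_AL_sampler('margin')(X, y, seed)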
+""" + +from functools import partial +AL_MAPPING = {} + + +def get_base_AL_mapping(): + from sampling_methods.margin_AL import MarginAL + from sampling_methods.informative_diverse import InformativeClusterDiverseSampler + from sampling_methods.hierarchical_clustering_AL import HierarchicalClusterAL + from sampling_methods.uniform_sampling import UniformSampling + from sampling_methods.represent_cluster_centers import RepresentativeClusterMeanSampling + from sampling_methods.graph_density import GraphDensitySampler + from sampling_methods.kcenter_greedy import kCenterGreedy + AL_MAPPING['margin'] = MarginAL + AL_MAPPING['informative_diverse'] = InformativeClusterDiverseSampler + AL_MAPPING['hierarchical'] = HierarchicalClusterAL + AL_MAPPING['uniform'] = UniformSampling + AL_MAPPING['margin_cluster_mean'] = RepresentativeClusterMeanSampling + AL_MAPPING['graph_density'] = GraphDensitySampler + AL_MAPPING['kcenter'] = kCenterGreedy + + +def get_all_possible_arms(): + from sampling_methods.mixture_of_samplers import MixtureOfSamplers + AL_MAPPING['mixture_of_samplers'] = MixtureOfSamplers + + +def get_wrapper_AL_mapping(): + from sampling_methods.bandit_discrete import BanditDiscreteSampler + from sampling_methods.simulate_batch import SimulateBatchSampler + AL_MAPPING['bandit_mixture'] = partial( + BanditDiscreteSampler, + samplers=[{ + 'methods': ['margin', 'uniform'], + 'weights': [0, 1] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0.25, 0.75] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0.5, 0.5] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0.75, 0.25] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }]) + AL_MAPPING['bandit_discrete'] = partial( + BanditDiscreteSampler, + samplers=[{ + 'methods': ['margin', 'uniform'], + 'weights': [0, 1] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }]) + AL_MAPPING['simulate_batch_mixture'] = partial( + SimulateBatchSampler, + samplers=({ + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0.5, 0.5] + }, { + 'methods': ['margin', 'uniform'], + 'weights': [0, 1] + }), + n_sims=5, + train_per_sim=10, + return_best_sim=False) + AL_MAPPING['simulate_batch_best_sim'] = partial( + SimulateBatchSampler, + samplers=[{ + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }], + n_sims=10, + train_per_sim=10, + return_type='best_sim') + AL_MAPPING['simulate_batch_frequency'] = partial( + SimulateBatchSampler, + samplers=[{ + 'methods': ['margin', 'uniform'], + 'weights': [1, 0] + }], + n_sims=10, + train_per_sim=10, + return_type='frequency') + +def get_mixture_of_samplers(name): + assert 'mixture_of_samplers' in name + if 'mixture_of_samplers' not in AL_MAPPING: + raise KeyError('Mixture of Samplers not yet loaded.') + args = name.split('-')[1:] + samplers = args[0::2] + weights = args[1::2] + weights = [float(w) for w in weights] + assert sum(weights) == 1 + mixture = {'methods': samplers, 'weights': weights} + print(mixture) + return partial(AL_MAPPING['mixture_of_samplers'], mixture=mixture) + + +def get_AL_sampler(name): + if name in AL_MAPPING and name != 'mixture_of_samplers': + return AL_MAPPING[name] + if 'mixture_of_samplers' in name: + return get_mixture_of_samplers(name) + raise NotImplementedError('The specified sampler is not available.') diff --git a/sampling_methods/graph_density.py b/sampling_methods/graph_density.py new file mode 100644 index 0000000..d6f13d5 --- /dev/null +++ 
b/sampling_methods/graph_density.py @@ -0,0 +1,92 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Diversity promoting sampling method that uses graph density to determine + most representative points. + +This is an implementation of the method described in +https://www.mpi-inf.mpg.de/fileadmin/inf/d2/Research_projects_files/EbertCVPR2012.pdf +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +from sklearn.neighbors import kneighbors_graph +from sklearn.metrics import pairwise_distances +import numpy as np +from sampling_methods.sampling_def import SamplingMethod + + +class GraphDensitySampler(SamplingMethod): + """Diversity promoting sampling method that uses graph density to determine + most representative points. + """ + + def __init__(self, X, y, seed): + self.name = 'graph_density' + self.X = X + self.flat_X = self.flatten_X() + # Set gamma for gaussian kernel to be equal to 1/n_features + self.gamma = 1. / self.X.shape[1] + self.compute_graph_density() + + def compute_graph_density(self, n_neighbor=10): + # kneighbors graph is constructed using k=10 + connect = kneighbors_graph(self.flat_X, n_neighbor,p=1) + # Make connectivity matrix symmetric, if a point is a k nearest neighbor of + # another point, make it vice versa + neighbors = connect.nonzero() + inds = zip(neighbors[0],neighbors[1]) + # Graph edges are weighted by applying gaussian kernel to manhattan dist. + # By default, gamma for rbf kernel is equal to 1/n_features but may + # get better results if gamma is tuned. + for entry in inds: + i = entry[0] + j = entry[1] + distance = pairwise_distances(self.flat_X[[i]],self.flat_X[[j]],metric='manhattan') + distance = distance[0,0] + weight = np.exp(-distance * self.gamma) + connect[i,j] = weight + connect[j,i] = weight + self.connect = connect + # Define graph density for an observation to be sum of weights for all + # edges to the node representing the datapoint. Normalize sum weights + # by total number of neighbors. + self.graph_density = np.zeros(self.X.shape[0]) + for i in np.arange(self.X.shape[0]): + self.graph_density[i] = connect[i,:].sum() / (connect[i,:]>0).sum() + self.starting_density = copy.deepcopy(self.graph_density) + + def select_batch_(self, N, already_selected, **kwargs): + # If a neighbor has already been sampled, reduce the graph density + # for its direct neighbors to promote diversity. 
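+    # Greedy selection: repeatedly pick the point with the highest remaining
+    # graph density, then subtract that point's density from its direct
+    # neighbors so that later picks come from other dense regions. Points
+    # already selected (or already added to this batch) are pushed below the
+    # running minimum density so that argmax never returns them.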
+ batch = set() + self.graph_density[already_selected] = min(self.graph_density) - 1 + while len(batch) < N: + selected = np.argmax(self.graph_density) + neighbors = (self.connect[selected,:] > 0).nonzero()[1] + self.graph_density[neighbors] = self.graph_density[neighbors] - self.graph_density[selected] + batch.add(selected) + self.graph_density[already_selected] = min(self.graph_density) - 1 + self.graph_density[list(batch)] = min(self.graph_density) - 1 + return list(batch) + + def to_dict(self): + output = {} + output['connectivity'] = self.connect + output['graph_density'] = self.starting_density + return output \ No newline at end of file diff --git a/sampling_methods/hierarchical_clustering_AL.py b/sampling_methods/hierarchical_clustering_AL.py new file mode 100644 index 0000000..33421d6 --- /dev/null +++ b/sampling_methods/hierarchical_clustering_AL.py @@ -0,0 +1,362 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Hierarchical cluster AL method. + +Implements algorithm described in Dasgupta, S and Hsu, D, +"Hierarchical Sampling for Active Learning, 2008 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from sklearn.cluster import AgglomerativeClustering +from sklearn.decomposition import PCA +from sklearn.neighbors import kneighbors_graph +from sampling_methods.sampling_def import SamplingMethod +from sampling_methods.utils.tree import Tree + + +class HierarchicalClusterAL(SamplingMethod): + """Implements hierarchical cluster AL based method. + + All methods are internal. select_batch_ is called via abstract classes + outward facing method select_batch. + + Default affininity is euclidean and default linkage is ward which links + cluster based on variance reduction. Hence, good results depend on + having normalized and standardized data. + """ + + def __init__(self, X, y, seed, beta=2, affinity='euclidean', linkage='ward', + clustering=None, max_features=None): + """Initializes AL method and fits hierarchical cluster to data. + + Args: + X: data + y: labels for determinining number of clusters as an input to + AgglomerativeClustering + seed: random seed used for sampling datapoints for batch + beta: width of error used to decide admissble labels, higher value of beta + corresponds to wider confidence and less stringent definition of + admissibility + See scikit Aggloerative clustering method for more info + affinity: distance metric used for hierarchical clustering + linkage: linkage method used to determine when to join clusters + clustering: can provide an AgglomerativeClustering that is already fit + max_features: limit number of features used to construct hierarchical + cluster. If specified, PCA is used to perform feature reduction and + the hierarchical clustering is performed using transformed features. 
+ """ + self.name = 'hierarchical' + self.seed = seed + np.random.seed(seed) + # Variables for the hierarchical cluster + self.already_clustered = False + if clustering is not None: + self.model = clustering + self.already_clustered = True + self.n_leaves = None + self.n_components = None + self.children_list = None + self.node_dict = None + self.root = None # Node name, all node instances access through self.tree + self.tree = None + # Variables for the AL algorithm + self.initialized = False + self.beta = beta + self.labels = {} + self.pruning = [] + self.admissible = {} + self.selected_nodes = None + # Data variables + self.classes = None + self.X = X + + classes = list(set(y)) + self.n_classes = len(classes) + if max_features is not None: + transformer = PCA(n_components=max_features) + transformer.fit(X) + self.transformed_X = transformer.fit_transform(X) + #connectivity = kneighbors_graph(self.transformed_X,max_features) + self.model = AgglomerativeClustering( + affinity=affinity, linkage=linkage, n_clusters=len(classes)) + self.fit_cluster(self.transformed_X) + else: + self.model = AgglomerativeClustering( + affinity=affinity, linkage=linkage, n_clusters=len(classes)) + self.fit_cluster(self.X) + self.y = y + + self.y_labels = {} + # Fit cluster and update cluster variables + + self.create_tree() + print('Finished creating hierarchical cluster') + + def fit_cluster(self, X): + if not self.already_clustered: + self.model.fit(X) + self.already_clustered = True + self.n_leaves = self.model.n_leaves_ + self.n_components = self.model.n_components_ + self.children_list = self.model.children_ + + def create_tree(self): + node_dict = {} + for i in range(self.n_leaves): + node_dict[i] = [None, None] + for i in range(len(self.children_list)): + node_dict[self.n_leaves + i] = self.children_list[i] + self.node_dict = node_dict + # The sklearn hierarchical clustering algo numbers leaves which correspond + # to actual datapoints 0 to n_points - 1 and all internal nodes have + # ids greater than n_points - 1 with the root having the highest node id + self.root = max(self.node_dict.keys()) + self.tree = Tree(self.root, self.node_dict) + self.tree.create_child_leaves_mapping(range(self.n_leaves)) + for v in node_dict: + self.admissible[v] = set() + + def get_child_leaves(self, node): + return self.tree.get_child_leaves(node) + + def get_node_leaf_counts(self, node_list): + node_counts = [] + for v in node_list: + node_counts.append(len(self.get_child_leaves(v))) + return np.array(node_counts) + + def get_class_counts(self, y): + """Gets the count of all classes in a sample. 
+ + Args: + y: sample vector for which to perform the count + Returns: + count of classes for the sample vector y, the class order for count will + be the same as that of self.classes + """ + unique, counts = np.unique(y, return_counts=True) + complete_counts = [] + for c in self.classes: + if c not in unique: + complete_counts.append(0) + else: + index = np.where(unique == c)[0][0] + complete_counts.append(counts[index]) + return np.array(complete_counts) + + def observe_labels(self, labeled): + for i in labeled: + self.y_labels[i] = labeled[i] + self.classes = np.array( + sorted(list(set([self.y_labels[k] for k in self.y_labels])))) + self.n_classes = len(self.classes) + + def initialize_algo(self): + self.pruning = [self.root] + self.labels[self.root] = np.random.choice(self.classes) + node = self.tree.get_node(self.root) + node.best_label = self.labels[self.root] + self.selected_nodes = [self.root] + + def get_node_class_probabilities(self, node, y=None): + children = self.get_child_leaves(node) + if y is None: + y_dict = self.y_labels + else: + y_dict = dict(zip(range(len(y)), y)) + labels = [y_dict[c] for c in children if c in y_dict] + # If no labels have been observed, simply return uniform distribution + if len(labels) == 0: + return 0, np.ones(self.n_classes)/self.n_classes + return len(labels), self.get_class_counts(labels) / (len(labels) * 1.0) + + def get_node_upper_lower_bounds(self, node): + n_v, p_v = self.get_node_class_probabilities(node) + # If no observations, return worst possible upper lower bounds + if n_v == 0: + return np.zeros(len(p_v)), np.ones(len(p_v)) + delta = 1. / n_v + np.sqrt(p_v * (1 - p_v) / (1. * n_v)) + return (np.maximum(p_v - delta, np.zeros(len(p_v))), + np.minimum(p_v + delta, np.ones(len(p_v)))) + + def get_node_admissibility(self, node): + p_lb, p_up = self.get_node_upper_lower_bounds(node) + all_other_min = np.vectorize( + lambda i:min([1 - p_up[c] for c in range(len(self.classes)) if c != i])) + lowest_alternative_error = self.beta * all_other_min( + np.arange(len(self.classes))) + return 1 - p_lb < lowest_alternative_error + + def get_adjusted_error(self, node): + _, prob = self.get_node_class_probabilities(node) + error = 1 - prob + admissible = self.get_node_admissibility(node) + not_admissible = np.where(admissible != True)[0] + error[not_admissible] = 1.0 + return error + + def get_class_probability_pruning(self, method='lower'): + prob_pruning = [] + for v in self.pruning: + label = self.labels[v] + label_ind = np.where(self.classes == label)[0][0] + if method == 'empirical': + _, v_prob = self.get_node_class_probabilities(v) + else: + lower, upper = self.get_node_upper_lower_bounds(v) + if method == 'lower': + v_prob = lower + elif method == 'upper': + v_prob = upper + else: + raise NotImplementedError + prob = v_prob[label_ind] + prob_pruning.append(prob) + return np.array(prob_pruning) + + def get_pruning_impurity(self, y): + impurity = [] + for v in self.pruning: + _, prob = self.get_node_class_probabilities(v, y) + impurity.append(1-max(prob)) + impurity = np.array(impurity) + weights = self.get_node_leaf_counts(self.pruning) + weights = weights / sum(weights) + return sum(impurity*weights) + + def update_scores(self): + node_list = set(range(self.n_leaves)) + # Loop through generations from bottom to top + while len(node_list) > 0: + parents = set() + for v in node_list: + node = self.tree.get_node(v) + # Update admissible labels for node + admissible = self.get_node_admissibility(v) + admissable_indices = np.where(admissible)[0] + 
for l in self.classes[admissable_indices]: + self.admissible[v].add(l) + # Calculate score + v_error = self.get_adjusted_error(v) + best_label_ind = np.argmin(v_error) + if admissible[best_label_ind]: + node.best_label = self.classes[best_label_ind] + score = v_error[best_label_ind] + node.split = False + + # Determine if node should be split + if v >= self.n_leaves: # v is not a leaf + if len(admissable_indices) > 0: # There exists an admissible label + # Make sure label set for node so that we can flow to children + # if necessary + assert node.best_label is not None + # Only split if all ancestors are admissible nodes + # This is part of definition of admissible pruning + admissible_ancestors = [len(self.admissible[a]) > 0 for a in + self.tree.get_ancestor(node)] + if all(admissible_ancestors): + left = self.node_dict[v][0] + left_node = self.tree.get_node(left) + right = self.node_dict[v][1] + right_node = self.tree.get_node(right) + node_counts = self.get_node_leaf_counts([v, left, right]) + split_score = (node_counts[1] / node_counts[0] * + left_node.score + node_counts[2] / + node_counts[0] * right_node.score) + if split_score < score: + score = split_score + node.split = True + node.score = score + if node.parent: + parents.add(node.parent.name) + node_list = parents + + def update_pruning_labels(self): + for v in self.selected_nodes: + node = self.tree.get_node(v) + pruning = self.tree.get_pruning(node) + self.pruning.remove(v) + self.pruning.extend(pruning) + # Check that pruning covers all leave nodes + node_counts = self.get_node_leaf_counts(self.pruning) + assert sum(node_counts) == self.n_leaves + # Fill in labels + for v in self.pruning: + node = self.tree.get_node(v) + if node.best_label is None: + node.best_label = node.parent.best_label + self.labels[v] = node.best_label + + def get_fake_labels(self): + fake_y = np.zeros(self.X.shape[0]) + for p in self.pruning: + indices = self.get_child_leaves(p) + fake_y[indices] = self.labels[p] + return fake_y + + def train_using_fake_labels(self, model, X_test, y_test): + classes_labeled = set([self.labels[p] for p in self.pruning]) + if len(classes_labeled) == self.n_classes: + fake_y = self.get_fake_labels() + model.fit(self.X, fake_y) + test_acc = model.score(X_test, y_test) + return test_acc + return 0 + + def select_batch_(self, N, already_selected, labeled, y, **kwargs): + # Observe labels for previously recommended batches + self.observe_labels(labeled) + + if not self.initialized: + self.initialize_algo() + self.initialized = True + print('Initialized algo') + + print('Updating scores and pruning for labels from last batch') + self.update_scores() + self.update_pruning_labels() + print('Nodes in pruning: %d' % (len(self.pruning))) + print('Actual impurity for pruning is: %.2f' % + (self.get_pruning_impurity(y))) + + # TODO(lishal): implement multiple selection methods + selected_nodes = set() + weights = self.get_node_leaf_counts(self.pruning) + probs = 1 - self.get_class_probability_pruning() + weights = weights * probs + weights = weights / sum(weights) + batch = [] + + print('Sampling batch') + while len(batch) < N: + node = np.random.choice(list(self.pruning), p=weights) + children = self.get_child_leaves(node) + children = [ + c for c in children if c not in self.y_labels and c not in batch + ] + if len(children) > 0: + selected_nodes.add(node) + batch.append(np.random.choice(children)) + self.selected_nodes = selected_nodes + return batch + + def to_dict(self): + output = {} + output['node_dict'] = self.node_dict + 
return output diff --git a/sampling_methods/informative_diverse.py b/sampling_methods/informative_diverse.py new file mode 100644 index 0000000..d102337 --- /dev/null +++ b/sampling_methods/informative_diverse.py @@ -0,0 +1,101 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Informative and diverse batch sampler that samples points with small margin +while maintaining same distribution over clusters as entire training data. + +Batch is created by sorting datapoints by increasing margin and then growing +the batch greedily. A point is added to the batch if the result batch still +respects the constraint that the cluster distribution of the batch will +match the cluster distribution of the entire training set. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from sklearn.cluster import MiniBatchKMeans +import numpy as np +from sampling_methods.sampling_def import SamplingMethod + + +class InformativeClusterDiverseSampler(SamplingMethod): + """Selects batch based on informative and diverse criteria. + + Returns highest uncertainty lowest margin points while maintaining + same distribution over clusters as entire dataset. + """ + + def __init__(self, X, y, seed): + self.name = 'informative_and_diverse' + self.X = X + self.flat_X = self.flatten_X() + # y only used for determining how many clusters there should be + # probably not practical to assume we know # of classes before hand + # should also probably scale with dimensionality of data + self.y = y + self.n_clusters = len(list(set(y))) + self.cluster_model = MiniBatchKMeans(n_clusters=self.n_clusters) + self.cluster_data() + + def cluster_data(self): + # Probably okay to always use MiniBatchKMeans + # Should standardize data before clustering + # Can cluster on standardized data but train on raw features if desired + self.cluster_model.fit(self.flat_X) + unique, counts = np.unique(self.cluster_model.labels_, return_counts=True) + self.cluster_prob = counts/sum(counts) + self.cluster_labels = self.cluster_model.labels_ + + def select_batch_(self, model, already_selected, N, **kwargs): + """Returns a batch of size N using informative and diverse selection. 
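# The cluster-matching constraint used by this sampler only needs the empirical
# cluster shares computed in cluster_data() above. A minimal standalone sketch;
# X_pool and the cluster count are synthetic placeholders, not repo objects.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

X_pool = np.random.RandomState(0).rand(500, 10)
km = MiniBatchKMeans(n_clusters=5, random_state=0).fit(X_pool)
_, counts = np.unique(km.labels_, return_counts=True)
cluster_prob = counts / float(counts.sum())  # target per-cluster share of each new batch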
+ + Args: + model: scikit learn model with decision_function implemented + already_selected: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to add using margin active learner + """ + # TODO(lishal): have MarginSampler and this share margin function + try: + distances = model.decision_function(self.X) + except: + distances = model.predict_proba(self.X) + if len(distances.shape) < 2: + min_margin = abs(distances) + else: + sort_distances = np.sort(distances, 1)[:, -2:] + min_margin = sort_distances[:, 1] - sort_distances[:, 0] + rank_ind = np.argsort(min_margin) + rank_ind = [i for i in rank_ind if i not in already_selected] + new_batch_cluster_counts = [0 for _ in range(self.n_clusters)] + new_batch = [] + for i in rank_ind: + if len(new_batch) == N: + break + label = self.cluster_labels[i] + if new_batch_cluster_counts[label] / N < self.cluster_prob[label]: + new_batch.append(i) + new_batch_cluster_counts[label] += 1 + n_slot_remaining = N - len(new_batch) + batch_filler = list(set(rank_ind) - set(already_selected) - set(new_batch)) + new_batch.extend(batch_filler[0:n_slot_remaining]) + return new_batch + + def to_dict(self): + output = {} + output['cluster_membership'] = self.cluster_labels + return output diff --git a/sampling_methods/kcenter_greedy.py b/sampling_methods/kcenter_greedy.py new file mode 100644 index 0000000..ff7e548 --- /dev/null +++ b/sampling_methods/kcenter_greedy.py @@ -0,0 +1,123 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Returns points that minimizes the maximum distance of any point to a center. + +Implements the k-Center-Greedy method in +Ozan Sener and Silvio Savarese. A Geometric Approach to Active Learning for +Convolutional Neural Networks. https://arxiv.org/abs/1708.00489 2017 + +Distance metric defaults to l2 distance. Features used to calculate distance +are either raw features or if a model has transform method then uses the output +of model.transform(X). + +Can be extended to a robust k centers algorithm that ignores a certain number of +outlier datapoints. Resulting centers are solution to multiple integer program. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from sklearn.metrics import pairwise_distances +from sampling_methods.sampling_def import SamplingMethod + + +class kCenterGreedy(SamplingMethod): + + def __init__(self, X, y, seed, metric='euclidean'): + self.X = X + self.y = y + self.flat_X = self.flatten_X() + self.name = 'kcenter' + self.features = self.flat_X + self.metric = metric + self.min_distances = None + self.n_obs = self.X.shape[0] + self.already_selected = [] + + def update_distances(self, cluster_centers, only_new=True, reset_dist=False): + """Update min distances given cluster centers. + + Args: + cluster_centers: indices of cluster centers + only_new: only calculate distance for newly selected points and update + min_distances. 
+ rest_dist: whether to reset min_distances. + """ + + if reset_dist: + self.min_distances = None + if only_new: + cluster_centers = [d for d in cluster_centers + if d not in self.already_selected] + if cluster_centers: + # Update min_distances for all examples given new cluster center. + x = self.features[cluster_centers] + dist = pairwise_distances(self.features, x, metric=self.metric) + + if self.min_distances is None: + self.min_distances = np.min(dist, axis=1).reshape(-1,1) + else: + self.min_distances = np.minimum(self.min_distances, dist) + + def select_batch_(self, model, already_selected, N, **kwargs): + """ + Diversity promoting active learning method that greedily forms a batch + to minimize the maximum distance to a cluster center among all unlabeled + datapoints. + + Args: + model: model with scikit-like API with decision_function implemented + already_selected: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to minimize distance to cluster centers + """ + + try: + # Assumes that the transform function takes in original data and not + # flattened data. + print('Getting transformed features...') + self.features = model.transform(self.X) + print('Calculating distances...') + self.update_distances(already_selected, only_new=False, reset_dist=True) + except: + print('Using flat_X as features.') + self.update_distances(already_selected, only_new=True, reset_dist=False) + + new_batch = [] + + for _ in range(N): + if self.already_selected is None: + # Initialize centers with a randomly selected datapoint + ind = np.random.choice(np.arange(self.n_obs)) + else: + ind = np.argmax(self.min_distances) + # New examples should not be in already selected since those points + # should have min_distance of zero to a cluster center. + assert ind not in already_selected + + self.update_distances([ind], only_new=True, reset_dist=False) + new_batch.append(ind) + print('Maximum distance from cluster centers is %0.2f' + % max(self.min_distances)) + + + self.already_selected = already_selected + + return new_batch + diff --git a/sampling_methods/margin_AL.py b/sampling_methods/margin_AL.py new file mode 100644 index 0000000..6058a84 --- /dev/null +++ b/sampling_methods/margin_AL.py @@ -0,0 +1,64 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Margin based AL method. + +Samples in batches based on margin scores. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from sampling_methods.sampling_def import SamplingMethod + + +class MarginAL(SamplingMethod): + def __init__(self, X, y, seed): + self.X = X + self.y = y + self.name = 'margin' + + def select_batch_(self, model, already_selected, N, **kwargs): + """Returns batch of datapoints with smallest margin/highest uncertainty. + + For binary classification, can just take the absolute distance to decision + boundary for each point. 
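# A stripped-down version of the farthest-first rule implemented in
# kcenter_greedy.py above: repeatedly pick the point with the largest distance
# to its nearest already-chosen center. Illustrative only; feats is a synthetic
# stand-in for the (possibly model-transformed) features.
import numpy as np
from sklearn.metrics import pairwise_distances

feats = np.random.RandomState(0).rand(200, 8)
centers = [0]  # arbitrary seed center
min_dist = pairwise_distances(feats, feats[centers]).min(axis=1)
for _ in range(9):  # choose 9 more centers
    new_center = int(np.argmax(min_dist))
    centers.append(new_center)
    dist_to_new = pairwise_distances(feats, feats[[new_center]]).ravel()
    min_dist = np.minimum(min_dist, dist_to_new)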
+ For multiclass classification, must consider the margin between distance for + top two most likely classes. + + Args: + model: scikit learn model with decision_function implemented + already_selected: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to add using margin active learner + """ + + try: + distances = model.decision_function(self.X) + except: + distances = model.predict_proba(self.X) + if len(distances.shape) < 2: + min_margin = abs(distances) + else: + sort_distances = np.sort(distances, 1)[:, -2:] + min_margin = sort_distances[:, 1] - sort_distances[:, 0] + rank_ind = np.argsort(min_margin) + rank_ind = [i for i in rank_ind if i not in already_selected] + active_samples = rank_ind[0:N] + return active_samples + diff --git a/sampling_methods/mixture_of_samplers.py b/sampling_methods/mixture_of_samplers.py new file mode 100644 index 0000000..9b1edbc --- /dev/null +++ b/sampling_methods/mixture_of_samplers.py @@ -0,0 +1,110 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Mixture of base sampling strategies + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +from sampling_methods.sampling_def import SamplingMethod +from sampling_methods.constants import AL_MAPPING, get_base_AL_mapping + +get_base_AL_mapping() + + +class MixtureOfSamplers(SamplingMethod): + """Samples according to mixture of base sampling methods. + + If duplicate points are selected by the mixed strategies when forming the batch + then the remaining slots are divided according to mixture weights and + another partial batch is requested until the batch is full. + """ + def __init__(self, + X, + y, + seed, + mixture={'methods': ('margin', 'uniform'), + 'weight': (0.5, 0.5)}, + samplers=None): + self.X = X + self.y = y + self.name = 'mixture_of_samplers' + self.sampling_methods = mixture['methods'] + self.sampling_weights = dict(zip(mixture['methods'], mixture['weights'])) + self.seed = seed + # A list of initialized samplers is allowed as an input because + # for AL_methods that search over different mixtures, may want mixtures to + # have shared AL_methods so that initialization is only performed once for + # computation intensive methods like HierarchicalClusteringAL and + # states are shared between mixtures. + # If initialized samplers are not provided, initialize them ourselves. + if samplers is None: + self.samplers = {} + self.initialize(self.sampling_methods) + else: + self.samplers = samplers + self.history = [] + + def initialize(self, samplers): + self.samplers = {} + for s in samplers: + self.samplers[s] = AL_MAPPING[s](self.X, self.y, self.seed) + + def select_batch_(self, already_selected, N, **kwargs): + """Returns batch of datapoints selected according to mixture weights. 
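# The margin score computed in margin_AL.py above (and reused verbatim in
# informative_diverse.py), as a standalone helper. clf is assumed to be any
# fitted classifier exposing predict_proba; clf and X_pool are placeholders.
import numpy as np

def smallest_margin_order(clf, X_pool):
    probs = clf.predict_proba(X_pool)      # shape (n_samples, n_classes)
    top2 = np.sort(probs, axis=1)[:, -2:]  # two largest scores per row
    margin = top2[:, 1] - top2[:, 0]       # small margin == high uncertainty
    return np.argsort(margin)              # most uncertain points first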
+ + Args: + already_included: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to add using margin active learner + """ + kwargs['already_selected'] = copy.copy(already_selected) + inds = set() + self.selected_by_sampler = {} + for s in self.sampling_methods: + self.selected_by_sampler[s] = [] + effective_N = 0 + while len(inds) < N: + effective_N += N - len(inds) + for s in self.sampling_methods: + if len(inds) < N: + batch_size = min(max(int(self.sampling_weights[s] * effective_N), 1), N) + sampler = self.samplers[s] + kwargs['N'] = batch_size + s_inds = sampler.select_batch(**kwargs) + for ind in s_inds: + if ind not in self.selected_by_sampler[s]: + self.selected_by_sampler[s].append(ind) + s_inds = [d for d in s_inds if d not in inds] + s_inds = s_inds[0 : min(len(s_inds), N-len(inds))] + inds.update(s_inds) + self.history.append(copy.deepcopy(self.selected_by_sampler)) + return list(inds) + + def to_dict(self): + output = {} + output['history'] = self.history + output['samplers'] = self.sampling_methods + output['mixture_weights'] = self.sampling_weights + for s in self.samplers: + s_output = self.samplers[s].to_dict() + output[s] = s_output + return output diff --git a/sampling_methods/represent_cluster_centers.py b/sampling_methods/represent_cluster_centers.py new file mode 100644 index 0000000..f761d19 --- /dev/null +++ b/sampling_methods/represent_cluster_centers.py @@ -0,0 +1,78 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Another informative and diverse sampler that mirrors the algorithm described +in Xu, et. al., Representative Sampling for Text Classification Using +Support Vector Machines, 2003 + +Batch is created by clustering points within the margin of the classifier and +choosing points closest to the k centroids. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from sklearn.cluster import MiniBatchKMeans +import numpy as np +from sampling_methods.sampling_def import SamplingMethod + + +class RepresentativeClusterMeanSampling(SamplingMethod): + """Selects batch based on informative and diverse criteria. + + Returns points within the margin of the classifier that are closest to the + k-means centers of those points. 
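# Picking the pool point nearest to each k-means centroid, as described above,
# takes a single pairwise-distance call. Standalone sketch; X_margin is a
# synthetic stand-in for the points that fall inside the classifier's margin.
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances

X_margin = np.random.RandomState(0).rand(300, 16)
km = MiniBatchKMeans(n_clusters=10, random_state=0).fit(X_margin)
dist = pairwise_distances(km.cluster_centers_, X_margin)  # shape (10, 300)
closest = np.argmin(dist, axis=1)  # one candidate index per centroid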
+ """ + + def __init__(self, X, y, seed): + self.name = 'cluster_mean' + self.X = X + self.flat_X = self.flatten_X() + self.y = y + self.seed = seed + + def select_batch_(self, model, N, already_selected, **kwargs): + # Probably okay to always use MiniBatchKMeans + # Should standardize data before clustering + # Can cluster on standardized data but train on raw features if desired + try: + distances = model.decision_function(self.X) + except: + distances = model.predict_proba(self.X) + if len(distances.shape) < 2: + min_margin = abs(distances) + else: + sort_distances = np.sort(distances, 1)[:, -2:] + min_margin = sort_distances[:, 1] - sort_distances[:, 0] + rank_ind = np.argsort(min_margin) + rank_ind = [i for i in rank_ind if i not in already_selected] + + distances = abs(model.decision_function(self.X)) + min_margin_by_class = np.min(abs(distances[already_selected]),axis=0) + unlabeled_in_margin = np.array([i for i in range(len(self.y)) + if i not in already_selected and + any(distances[i] 2: + flat_X = np.reshape(self.X, (shape[0],np.product(shape[1:]))) + return flat_X + + + @abc.abstractmethod + def select_batch_(self): + return + + def select_batch(self, **kwargs): + return self.select_batch_(**kwargs) + + def to_dict(self): + return None \ No newline at end of file diff --git a/sampling_methods/simulate_batch.py b/sampling_methods/simulate_batch.py new file mode 100644 index 0000000..c7f37c2 --- /dev/null +++ b/sampling_methods/simulate_batch.py @@ -0,0 +1,261 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Select a new batch based on results of simulated trajectories.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import math + +import numpy as np + +from sampling_methods.wrapper_sampler_def import AL_MAPPING +from sampling_methods.wrapper_sampler_def import WrapperSamplingMethod + + +class SimulateBatchSampler(WrapperSamplingMethod): + """Creates batch based on trajectories simulated using smaller batch sizes. + + Current support use case: simulate smaller batches than the batch size + actually indicated to emulate which points would be selected in a + smaller batch setting. This method can do better than just selecting + a batch straight out if smaller batches perform better and the simulations + are informative enough and are not hurt too much by labeling noise. + """ + + def __init__(self, + X, + y, + seed, + samplers=[{'methods': ('margin', 'uniform'),'weight': (1, 0)}], + n_sims=10, + train_per_sim=10, + return_type='best_sim'): + """ Initialize sampler with options. 
+ + Args: + X: training data + y: labels may be used by base sampling methods + seed: seed for np.random + samplers: list of dicts with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + n_sims: number of total trajectories to simulate + train_per_sim: number of minibatches to split the batch into + return_type: two return types supported right now + best_sim: return points selected by the best trajectory + frequency: returns points selected the most over all trajectories + """ + self.name = 'simulate_batch' + self.X = X + self.y = y + self.seed = seed + self.n_sims = n_sims + self.train_per_sim = train_per_sim + self.return_type = return_type + self.samplers_list = samplers + self.initialize_samplers(self.samplers_list) + self.trace = [] + self.selected = [] + np.random.seed(seed) + + def simulate_batch(self, sampler, N, already_selected, y, model, X_test, + y_test, **kwargs): + """Simulates smaller batches by using hallucinated y to select next batch. + + Assumes that select_batch is only dependent on already_selected and not on + any other states internal to the sampler. i.e. this would not work with + BanditDiscreteSampler but will work with margin, hierarchical, and uniform. + + Args: + sampler: dict with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + N: batch size + already_selected: indices already labeled + y: y to use for training + model: model to use for margin calc + X_test: validaiton data + y_test: validation labels + + Returns: + - mean accuracy + - indices selected by best hallucinated trajectory + - best accuracy achieved by one of the trajectories + """ + minibatch = max(int(math.ceil(N / self.train_per_sim)), 1) + results = [] + best_acc = 0 + best_inds = [] + self.selected = [] + n_minibatch = int(N/minibatch) + (N % minibatch > 0) + + for _ in range(self.n_sims): + inds = [] + hallucinated_y = [] + + # Copy these objects to make sure they are not modified while simulating + # trajectories as they are used later by the main run_experiment script. 
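# Each simulated trajectory below selects minibatches with the base sampler,
# draws a hallucinated label for every newly picked point from the current
# model's predict_proba, refits on the real plus hallucinated labels, and
# scores the refit model on (X_test, y_test); the indices chosen by the
# best-scoring trajectory are reported back as best_inds.
# (E.g. N=25 with train_per_sim=10 gives minibatch=3 and 9 rounds, the last of
# size 1.)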
+ kwargs['already_selected'] = copy.copy(already_selected) + kwargs['y'] = copy.copy(y) + # Assumes that model has already by fit using all labeled data so + # the probabilities can be used immediately to hallucinate labels + kwargs['model'] = copy.deepcopy(model) + + for _ in range(n_minibatch): + batch_size = min(minibatch, N-len(inds)) + if batch_size > 0: + kwargs['N'] = batch_size + new_inds = sampler.select_batch(**kwargs) + inds.extend(new_inds) + + # All models need to have predict_proba method + probs = kwargs['model'].predict_proba(self.X[new_inds]) + # Hallucinate labels for selected datapoints to be label + # using class probabilities from model + try: + classes = kwargs['model'].best_estimator_.classes_ + except: + classes = kwargs['model'].classes_ + new_y = ([ + np.random.choice(classes, p=probs[i, :]) + for i in range(batch_size) + ]) + hallucinated_y.extend(new_y) + # Not saving already_selected here, if saving then should sort + # only for the input to fit but preserve ordering of indices in + # already_selected + kwargs['already_selected'] = sorted(kwargs['already_selected'] + + new_inds) + kwargs['y'][new_inds] = new_y + kwargs['model'].fit(self.X[kwargs['already_selected']], + kwargs['y'][kwargs['already_selected']]) + acc_hallucinated = kwargs['model'].score(X_test, y_test) + if acc_hallucinated > best_acc: + best_acc = acc_hallucinated + best_inds = inds + kwargs['model'].fit(self.X[kwargs['already_selected']], + y[kwargs['already_selected']]) + # Useful to know how accuracy compares for model trained on hallucinated + # labels vs trained on true labels. But can remove this train to speed + # up simulations. Won't speed up significantly since many more models + # are being trained inside the loop above. + acc_true = kwargs['model'].score(X_test, y_test) + results.append([acc_hallucinated, acc_true]) + print('Hallucinated acc: %.3f, Actual Acc: %.3f' % (acc_hallucinated, + acc_true)) + + # Save trajectory for reference + t = {} + t['arm'] = sampler + t['data_size'] = len(kwargs['already_selected']) + t['inds'] = inds + t['y_hal'] = hallucinated_y + t['acc_hal'] = acc_hallucinated + t['acc_true'] = acc_true + self.trace.append(t) + self.selected.extend(inds) + # Delete created copies + del kwargs['model'] + del kwargs['already_selected'] + results = np.array(results) + return np.mean(results, axis=0), best_inds, best_acc + + def sampler_select_batch(self, sampler, N, already_selected, y, model, X_test, y_test, **kwargs): + """Calculate the performance of the model if the batch had been selected using the base method without simulation. 
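# Drawing hallucinated labels from a model's class probabilities, as done in
# the simulation loop above, in isolation. clf and X_new are placeholders for
# any fitted classifier with predict_proba and the freshly selected points.
import numpy as np

def hallucinate_labels(clf, X_new, rng=np.random):
    probs = clf.predict_proba(X_new)
    return np.array([rng.choice(clf.classes_, p=p) for p in probs])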
+ + Args: + sampler: dict with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + N: batch size + already_selected: indices already selected + y: labels to use for training + model: model to use for training + X_test, y_test: validation set + + Returns: + - indices selected by base method + - validation accuracy of model trained on new batch + """ + m = copy.deepcopy(model) + kwargs['y'] = y + kwargs['model'] = m + kwargs['already_selected'] = copy.copy(already_selected) + inds = [] + kwargs['N'] = N + inds.extend(sampler.select_batch(**kwargs)) + kwargs['already_selected'] = sorted(kwargs['already_selected'] + inds) + + m.fit(self.X[kwargs['already_selected']], y[kwargs['already_selected']]) + acc = m.score(X_test, y_test) + del m + del kwargs['already_selected'] + return inds, acc + + def select_batch_(self, N, already_selected, y, model, + X_test, y_test, **kwargs): + """ Returns a batch of size N selected by using the best sampler in simulation + + Args: + samplers: list of sampling methods represented by dict with two fields + 'samplers': list of named samplers + 'weights': percentage of batch to allocate to each sampler + N: batch size + already_selected: indices of datapoints already labeled + y: actual labels, used to compare simulation with actual + model: training model to use to evaluate different samplers. Model must + have a predict_proba method with same signature as that in sklearn + n_sims: the number of simulations to perform for each sampler + minibatch: batch size to use for simulation + """ + + results = [] + + # THE INPUTS CANNOT BE MODIFIED SO WE MAKE COPIES FOR THE CHECK LATER + # Should check model but kernel_svm does not have coef_ so need better + # handling here + copy_selected = copy.copy(already_selected) + copy_y = copy.copy(y) + + for s in self.samplers: + sim_results, sim_inds, sim_acc = self.simulate_batch( + s, N, already_selected, y, model, X_test, y_test, **kwargs) + real_inds, acc = self.sampler_select_batch( + s, N, already_selected, y, model, X_test, y_test, **kwargs) + print('Best simulated acc: %.3f, Actual acc: %.3f' % (sim_acc, acc)) + results.append([sim_results, sim_inds, real_inds, acc]) + best_s = np.argmax([r[0][0] for r in results]) + + # Make sure that model object fed in did not change during simulations + assert all(y == copy_y) + assert all([copy_selected[i] == already_selected[i] + for i in range(len(already_selected))]) + + # Return indices based on return type specified + if self.return_type == 'best_sim': + return results[best_s][1] + elif self.return_type == 'frequency': + unique, counts = np.unique(self.selected, return_counts=True) + argcount = np.argsort(-counts) + return list(unique[argcount[0:N]]) + return results[best_s][2] + + def to_dict(self): + output = {} + output['simulated_trajectories'] = self.trace + return output diff --git a/sampling_methods/uniform_sampling.py b/sampling_methods/uniform_sampling.py new file mode 100644 index 0000000..27866b3 --- /dev/null +++ b/sampling_methods/uniform_sampling.py @@ -0,0 +1,52 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Uniform sampling method. + +Samples in batches. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from sampling_methods.sampling_def import SamplingMethod + + +class UniformSampling(SamplingMethod): + + def __init__(self, X, y, seed): + self.X = X + self.y = y + self.name = 'uniform' + np.random.seed(seed) + + def select_batch_(self, already_selected, N, **kwargs): + """Returns batch of randomly sampled datapoints. + + Assumes that data has already been shuffled. + + Args: + already_selected: index of datapoints already selected + N: batch size + + Returns: + indices of points selected to label + """ + + # This is uniform given the remaining pool but biased wrt the entire pool. + sample = [i for i in range(self.X.shape[0]) if i not in already_selected] + return sample[0:N] diff --git a/sampling_methods/utils/__init__.py b/sampling_methods/utils/__init__.py new file mode 100644 index 0000000..3eeb306 --- /dev/null +++ b/sampling_methods/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/sampling_methods/utils/tree.py b/sampling_methods/utils/tree.py new file mode 100644 index 0000000..bfa59d1 --- /dev/null +++ b/sampling_methods/utils/tree.py @@ -0,0 +1,158 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Node and Tree class to support hierarchical clustering AL method. + +Assumed to be binary tree. + +Node class is used to represent each node in a hierarchical clustering. +Each node has certain properties that are used in the AL method. + +Tree class is used to traverse a hierarchical clustering. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + + +class Node(object): + """Node class for hierarchical clustering. + + Initialized with name and left right children. 
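# Tree (below) is constructed from a node_dict that maps every internal node id
# to its (left, right) child ids, with leaves mapping to (None, None);
# tree_test.py exercises this with a seven-node example.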
+ """ + + def __init__(self, name, left=None, right=None): + self.name = name + self.left = left + self.right = right + self.is_leaf = left is None and right is None + self.parent = None + # Fields for hierarchical clustering AL + self.score = 1.0 + self.split = False + self.best_label = None + self.weight = None + + def set_parent(self, parent): + self.parent = parent + + +class Tree(object): + """Tree object for traversing a binary tree. + + Most methods apply to trees in general with the exception of get_pruning + which is specific to the hierarchical clustering AL method. + """ + + def __init__(self, root, node_dict): + """Initializes tree and creates all nodes in node_dict. + + Args: + root: id of the root node + node_dict: dictionary with node_id as keys and entries indicating + left and right child of node respectively. + """ + self.node_dict = node_dict + self.root = self.make_tree(root) + self.nodes = {} + self.leaves_mapping = {} + self.fill_parents() + self.n_leaves = None + + def print_tree(self, node, max_depth): + """Helper function to print out tree for debugging.""" + node_list = [node] + output = "" + level = 0 + while level < max_depth and len(node_list): + children = set() + for n in node_list: + node = self.get_node(n) + output += ("\t"*level+"node %d: score %.2f, weight %.2f" % + (node.name, node.score, node.weight)+"\n") + if node.left: + children.add(node.left.name) + if node.right: + children.add(node.right.name) + level += 1 + node_list = children + return print(output) + + def make_tree(self, node_id): + if node_id is not None: + return Node(node_id, + self.make_tree(self.node_dict[node_id][0]), + self.make_tree(self.node_dict[node_id][1])) + + def fill_parents(self): + # Setting parent and storing nodes in dict for fast access + def rec(pointer, parent): + if pointer is not None: + self.nodes[pointer.name] = pointer + pointer.set_parent(parent) + rec(pointer.left, pointer) + rec(pointer.right, pointer) + rec(self.root, None) + + def get_node(self, node_id): + return self.nodes[node_id] + + def get_ancestor(self, node): + ancestors = [] + if isinstance(node, int): + node = self.get_node(node) + while node.name != self.root.name: + node = node.parent + ancestors.append(node.name) + return ancestors + + def fill_weights(self): + for v in self.node_dict: + node = self.get_node(v) + node.weight = len(self.leaves_mapping[v]) / (1.0 * self.n_leaves) + + def create_child_leaves_mapping(self, leaves): + """DP for creating child leaves mapping. + + Storing in dict to save recompute. + """ + self.n_leaves = len(leaves) + for v in leaves: + self.leaves_mapping[v] = [v] + node_list = set([self.get_node(v).parent for v in leaves]) + while node_list: + to_fill = copy.copy(node_list) + for v in node_list: + if (v.left.name in self.leaves_mapping + and v.right.name in self.leaves_mapping): + to_fill.remove(v) + self.leaves_mapping[v.name] = (self.leaves_mapping[v.left.name] + + self.leaves_mapping[v.right.name]) + if v.parent is not None: + to_fill.add(v.parent) + node_list = to_fill + self.fill_weights() + + def get_child_leaves(self, node): + return self.leaves_mapping[node] + + def get_pruning(self, node): + if node.split: + return self.get_pruning(node.left) + self.get_pruning(node.right) + else: + return [node.name] + diff --git a/sampling_methods/utils/tree_test.py b/sampling_methods/utils/tree_test.py new file mode 100644 index 0000000..d36ea8b --- /dev/null +++ b/sampling_methods/utils/tree_test.py @@ -0,0 +1,79 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for sampling_methods.utils.tree.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +from sampling_methods.utils import tree + + +class TreeTest(unittest.TestCase): + + def setUp(self): + node_dict = { + 1: (2, 3), + 2: (4, 5), + 3: (6, 7), + 4: [None, None], + 5: [None, None], + 6: [None, None], + 7: [None, None] + } + self.tree = tree.Tree(1, node_dict) + self.tree.create_child_leaves_mapping([4, 5, 6, 7]) + node = self.tree.get_node(1) + node.split = True + node = self.tree.get_node(2) + node.split = True + + def assertNode(self, node, name, left, right): + self.assertEqual(node.name, name) + self.assertEqual(node.left.name, left) + self.assertEqual(node.right.name, right) + + def testTreeRootSetCorrectly(self): + self.assertNode(self.tree.root, 1, 2, 3) + + def testGetNode(self): + node = self.tree.get_node(1) + assert isinstance(node, tree.Node) + self.assertEqual(node.name, 1) + + def testFillParent(self): + node = self.tree.get_node(3) + self.assertEqual(node.parent.name, 1) + + def testGetAncestors(self): + ancestors = self.tree.get_ancestor(5) + self.assertTrue(all([a in ancestors for a in [1, 2]])) + + def testChildLeaves(self): + leaves = self.tree.get_child_leaves(3) + self.assertTrue(all([c in leaves for c in [6, 7]])) + + def testFillWeights(self): + node = self.tree.get_node(3) + self.assertEqual(node.weight, 0.5) + + def testGetPruning(self): + node = self.tree.get_node(1) + pruning = self.tree.get_pruning(node) + self.assertTrue(all([n in pruning for n in [3, 4, 5]])) + +if __name__ == '__main__': + unittest.main() diff --git a/sampling_methods/wrapper_sampler_def.py b/sampling_methods/wrapper_sampler_def.py new file mode 100644 index 0000000..09361ba --- /dev/null +++ b/sampling_methods/wrapper_sampler_def.py @@ -0,0 +1,50 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Abstract class for wrapper sampling methods that call base sampling methods. + +Provides interface to sampling methods that allow same signature +for select_batch. Each subclass implements select_batch_ with the desired +signature for readability. 
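# initialize_samplers (below) expects a list of mixture dicts in the format
# consumed by MixtureOfSamplers, e.g. (illustrative values only):
mixtures = [
    {'methods': ('margin', 'uniform'), 'weights': (0.5, 0.5)},
    {'methods': ('margin', 'informative_diverse'), 'weights': (0.3, 0.7)},
]
# Note that MixtureOfSamplers reads the 'weights' key, so mixtures passed in
# should spell it that way.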
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +from sampling_methods.constants import AL_MAPPING +from sampling_methods.constants import get_all_possible_arms +from sampling_methods.sampling_def import SamplingMethod + +get_all_possible_arms() + + +class WrapperSamplingMethod(SamplingMethod): + __metaclass__ = abc.ABCMeta + + def initialize_samplers(self, mixtures): + methods = [] + for m in mixtures: + methods += m['methods'] + methods = set(methods) + self.base_samplers = {} + for s in methods: + self.base_samplers[s] = AL_MAPPING[s](self.X, self.y, self.seed) + self.samplers = [] + for m in mixtures: + self.samplers.append( + AL_MAPPING['mixture_of_samplers'](self.X, self.y, self.seed, m, + self.base_samplers)) diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..3eeb306 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/utils/allconv.py b/utils/allconv.py new file mode 100644 index 0000000..f67070d --- /dev/null +++ b/utils/allconv.py @@ -0,0 +1,196 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
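# The wrapper below mirrors the scikit-learn estimator API used elsewhere in
# this repo (fit, predict, score, decision_function, transform, get_params,
# set_params). Hypothetical usage, not runnable as-is:
#   model = AllConv(epochs=5, batch_size=32)
#   model.fit(X_train, y_train)
#   acc = model.score(X_val, y_val)
#   feats = model.transform(X_val)  # activations of the last conv layer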
+ +"""Implements allconv model in keras using tensorflow backend.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import keras +import keras.backend as K +from keras.layers import Activation +from keras.layers import Conv2D +from keras.layers import Dropout +from keras.layers import GlobalAveragePooling2D +from keras.models import Sequential + +import numpy as np +import tensorflow as tf + + +class AllConv(object): + """allconv network that matches sklearn api.""" + + def __init__(self, + random_state=1, + epochs=50, + batch_size=32, + solver='rmsprop', + learning_rate=0.001, + lr_decay=0.): + # params + self.solver = solver + self.epochs = epochs + self.batch_size = batch_size + self.learning_rate = learning_rate + self.lr_decay = lr_decay + # data + self.encode_map = None + self.decode_map = None + self.model = None + self.random_state = random_state + self.n_classes = None + + def build_model(self, X): + # assumes that data axis order is same as the backend + input_shape = X.shape[1:] + np.random.seed(self.random_state) + tf.set_random_seed(self.random_state) + + model = Sequential() + model.add(Conv2D(96, (3, 3), padding='same', + input_shape=input_shape, name='conv1')) + model.add(Activation('relu')) + model.add(Conv2D(96, (3, 3), name='conv2', padding='same')) + model.add(Activation('relu')) + model.add(Conv2D(96, (3, 3), strides=(2, 2), padding='same', name='conv3')) + model.add(Activation('relu')) + model.add(Dropout(0.5)) + + model.add(Conv2D(192, (3, 3), name='conv4', padding='same')) + model.add(Activation('relu')) + model.add(Conv2D(192, (3, 3), name='conv5', padding='same')) + model.add(Activation('relu')) + model.add(Conv2D(192, (3, 3), strides=(2, 2), name='conv6', padding='same')) + model.add(Activation('relu')) + model.add(Dropout(0.5)) + + model.add(Conv2D(192, (3, 3), name='conv7', padding='same')) + model.add(Activation('relu')) + model.add(Conv2D(192, (1, 1), name='conv8', padding='valid')) + model.add(Activation('relu')) + model.add(Conv2D(10, (1, 1), name='conv9', padding='valid')) + + model.add(GlobalAveragePooling2D()) + model.add(Activation('softmax', name='activation_top')) + model.summary() + + try: + optimizer = getattr(keras.optimizers, self.solver) + except: + raise NotImplementedError('optimizer not implemented in keras') + # All optimizers with the exception of nadam take decay as named arg + try: + opt = optimizer(lr=self.learning_rate, decay=self.lr_decay) + except: + opt = optimizer(lr=self.learning_rate, schedule_decay=self.lr_decay) + + model.compile(loss='categorical_crossentropy', + optimizer=opt, + metrics=['accuracy']) + # Save initial weights so that model can be retrained with same + # initialization + self.initial_weights = copy.deepcopy(model.get_weights()) + + self.model = model + + def create_y_mat(self, y): + y_encode = self.encode_y(y) + y_encode = np.reshape(y_encode, (len(y_encode), 1)) + y_mat = keras.utils.to_categorical(y_encode, self.n_classes) + return y_mat + + # Add handling for classes that do not start counting from 0 + def encode_y(self, y): + if self.encode_map is None: + self.classes_ = sorted(list(set(y))) + self.n_classes = len(self.classes_) + self.encode_map = dict(zip(self.classes_, range(len(self.classes_)))) + self.decode_map = dict(zip(range(len(self.classes_)), self.classes_)) + mapper = lambda x: self.encode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def decode_y(self, y): + mapper = lambda x: 
self.decode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def fit(self, X_train, y_train, sample_weight=None): + y_mat = self.create_y_mat(y_train) + + if self.model is None: + self.build_model(X_train) + + # We don't want incremental fit so reset learning rate and weights + K.set_value(self.model.optimizer.lr, self.learning_rate) + self.model.set_weights(self.initial_weights) + self.model.fit( + X_train, + y_mat, + batch_size=self.batch_size, + epochs=self.epochs, + shuffle=True, + sample_weight=sample_weight, + verbose=0) + + def predict(self, X_val): + predicted = self.model.predict(X_val) + return predicted + + def score(self, X_val, val_y): + y_mat = self.create_y_mat(val_y) + val_acc = self.model.evaluate(X_val, y_mat)[1] + return val_acc + + def decision_function(self, X): + return self.predict(X) + + def transform(self, X): + model = self.model + inp = [model.input] + activations = [] + + # Get activations of the last conv layer. + output = [layer.output for layer in model.layers if + layer.name == 'conv9'][0] + func = K.function(inp + [K.learning_phase()], [output]) + for i in range(int(X.shape[0]/self.batch_size) + 1): + minibatch = X[i * self.batch_size + : min(X.shape[0], (i+1) * self.batch_size)] + list_inputs = [minibatch, 0.] + # Learning phase. 0 = Test mode (no dropout or batch normalization) + layer_output = func(list_inputs)[0] + activations.append(layer_output) + output = np.vstack(tuple(activations)) + output = np.reshape(output, (output.shape[0],np.product(output.shape[1:]))) + return output + + def get_params(self, deep = False): + params = {} + params['solver'] = self.solver + params['epochs'] = self.epochs + params['batch_size'] = self.batch_size + params['learning_rate'] = self.learning_rate + params['weight_decay'] = self.lr_decay + if deep: + return copy.deepcopy(params) + return copy.copy(params) + + def set_params(self, **parameters): + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self diff --git a/utils/chart_data.py b/utils/chart_data.py new file mode 100644 index 0000000..8fd876e --- /dev/null +++ b/utils/chart_data.py @@ -0,0 +1,230 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experiment charting script. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pickle + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.backends.backend_pdf import PdfPages + +from google.apputils import app +import gflags as flags +from tensorflow import gfile + +flags.DEFINE_string('source_dir', + '/tmp/toy_experiments', + 'Directory with the output to analyze.') +flags.DEFINE_string('save_dir', '/tmp/active_learning', + 'Directory to save charts.') +flags.DEFINE_string('dataset', 'letter', 'Dataset to analyze.') +flags.DEFINE_string( + 'sampling_methods', + ('uniform,margin,informative_diverse,' + 'pred_expert_advice_trip_agg,' + 'mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34'), + 'Comma separated string of sampling methods to include in chart.') +flags.DEFINE_string('scoring_methods', 'logistic,kernel_ls', + 'Comma separated string of scoring methods to chart.') +flags.DEFINE_bool('normalize', False, 'Chart runs using normalized data.') +flags.DEFINE_bool('standardize', True, 'Chart runs using standardized data.') + +FLAGS = flags.FLAGS + + +def combine_results(files, diff=False): + all_results = {} + for f in files: + data = pickle.load(gfile.FastGFile(f, 'r')) + for k in data: + if isinstance(k, tuple): + data[k].pop('noisy_targets') + data[k].pop('indices') + data[k].pop('selected_inds') + data[k].pop('sampler_output') + key = list(k) + seed = key[-1] + key = key[0:10] + key = tuple(key) + if key in all_results: + if seed not in all_results[key]['random_seeds']: + all_results[key]['random_seeds'].append(seed) + for field in [f for f in data[k] if f != 'n_points']: + all_results[key][field] = np.vstack( + (all_results[key][field], data[k][field])) + else: + all_results[key] = data[k] + all_results[key]['random_seeds'] = [seed] + else: + all_results[k] = data[k] + return all_results + + +def plot_results(all_results, score_method, norm, stand, sampler_filter): + colors = { + 'margin': + 'gold', + 'uniform': + 'k', + 'informative_diverse': + 'r', + 'mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34': + 'b', + 'pred_expert_advice_trip_agg': + 'g' + } + labels = { + 'margin': + 'margin', + 'uniform': + 'uniform', + 'mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34': + 'margin:0.33,informative_diverse:0.33, uniform:0.34', + 'informative_diverse': + 'informative and diverse', + 'pred_expert_advice_trip_agg': + 'expert: margin,informative_diverse,uniform' + } + markers = { + 'margin': + 'None', + 'uniform': + 'None', + 'mixture_of_samplers-margin-0.33-informative_diverse-0.33-uniform-0.34': + '>', + 'informative_diverse': + 'None', + 'pred_expert_advice_trip_agg': + 'p' + } + fields = all_results['tuple_keys'] + fields = dict(zip(fields, range(len(fields)))) + + for k in sorted(all_results.keys()): + sampler = k[fields['sampler']] + if (isinstance(k, tuple) and + k[fields['score_method']] == score_method and + k[fields['standardize']] == stand and + k[fields['normalize']] == norm and + (sampler_filter is None or sampler in sampler_filter)): + results = all_results[k] + n_trials = results['accuracy'].shape[0] + x = results['data_sizes'][0] + mean_acc = np.mean(results['accuracy'], axis=0) + CI_acc = np.std(results['accuracy'], axis=0) / np.sqrt(n_trials) * 2.96 + if sampler == 'uniform': + plt.plot( + x, + mean_acc, + linewidth=1, + label=labels[sampler], + color=colors[sampler], + linestyle='--' + ) + plt.fill_between( + x, + mean_acc - CI_acc, 
+ mean_acc + CI_acc, + color=colors[sampler], + alpha=0.2 + ) + else: + plt.plot( + x, + mean_acc, + linewidth=1, + label=labels[sampler], + color=colors[sampler], + marker=markers[sampler], + markeredgecolor=colors[sampler] + ) + plt.fill_between( + x, + mean_acc - CI_acc, + mean_acc + CI_acc, + color=colors[sampler], + alpha=0.2 + ) + plt.legend(loc=4) + + +def get_between(filename, start, end): + start_ind = filename.find(start) + len(start) + end_ind = filename.rfind(end) + return filename[start_ind:end_ind] + + +def get_sampling_method(dataset, filename): + return get_between(filename, dataset + '_', '/') + + +def get_scoring_method(filename): + return get_between(filename, 'results_score_', '_select_') + + +def get_normalize(filename): + return get_between(filename, '_norm_', '_stand_') == 'True' + + +def get_standardize(filename): + return get_between( + filename, '_stand_', filename[filename.rfind('_'):]) == 'True' + + +def main(argv): + del argv # Unused. + if not gfile.Exists(FLAGS.save_dir): + gfile.MkDir(FLAGS.save_dir) + charting_filepath = os.path.join(FLAGS.save_dir, + FLAGS.dataset + '_charts.pdf') + sampling_methods = FLAGS.sampling_methods.split(',') + scoring_methods = FLAGS.scoring_methods.split(',') + files = gfile.Glob( + os.path.join(FLAGS.source_dir, FLAGS.dataset + '*/results*.pkl')) + files = [ + f for f in files + if (get_sampling_method(FLAGS.dataset, f) in sampling_methods and + get_scoring_method(f) in scoring_methods and + get_normalize(f) == FLAGS.normalize and + get_standardize(f) == FLAGS.standardize) + ] + + print('Reading in %d files...' % len(files)) + all_results = combine_results(files) + pdf = PdfPages(charting_filepath) + + print('Plotting charts...') + plt.style.use('ggplot') + for m in scoring_methods: + plot_results( + all_results, + m, + FLAGS.normalize, + FLAGS.standardize, + sampler_filter=sampling_methods) + plt.title('Dataset: %s, Score Method: %s' % (FLAGS.dataset, m)) + pdf.savefig() + plt.close() + pdf.close() + + +if __name__ == '__main__': + app.run() diff --git a/utils/create_data.py b/utils/create_data.py new file mode 100644 index 0000000..b47726c --- /dev/null +++ b/utils/create_data.py @@ -0,0 +1,284 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Make datasets and save specified directory. + +Downloads datasets using scikit datasets and can also parse csv file +to save into pickle format. 
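# Every loader in this module returns an object exposing .data (features) and
# .target (labels), mirroring the Bunch convention of the scikit-learn dataset
# fetchers, before get_mldata() pickles them as a {'data', 'target'} dict.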
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from io import BytesIO +import os +import pickle +import StringIO +import tarfile +import urllib2 + +import keras.backend as K +from keras.datasets import cifar10 +from keras.datasets import cifar100 +from keras.datasets import mnist + +import numpy as np +import pandas as pd +from sklearn.datasets import fetch_20newsgroups_vectorized +from sklearn.datasets import fetch_mldata +from sklearn.datasets import load_breast_cancer +from sklearn.datasets import load_iris +import sklearn.datasets.rcv1 +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfTransformer + +from google.apputils import app +import gflags as flags +from tensorflow import gfile + +flags.DEFINE_string('save_dir', '/tmp/data', + 'Where to save outputs') +flags.DEFINE_string('datasets', '', + 'Which datasets to download, comma separated.') +FLAGS = flags.FLAGS + + +class Dataset(object): + + def __init__(self, X, y): + self.data = X + self.target = y + + +def get_csv_data(filename): + """Parse csv and return Dataset object with data and targets. + + Create pickle data from csv, assumes the first column contains the targets + Args: + filename: complete path of the csv file + Returns: + Dataset object + """ + f = gfile.GFile(filename, 'r') + mat = [] + for l in f: + row = l.strip() + row = row.replace('"', '') + row = row.split(',') + row = [float(x) for x in row] + mat.append(row) + mat = np.array(mat) + y = mat[:, 0] + X = mat[:, 1:] + data = Dataset(X, y) + return data + + +def get_wikipedia_talk_data(): + """Get wikipedia talk dataset. + + See here for more information about the dataset: + https://figshare.com/articles/Wikipedia_Detox_Data/4054689 + Downloads annotated comments and annotations. 
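# The labeling step below keeps a comment as an "attack" only when a majority
# of annotators flagged it. The same majority vote in isolation, on a toy frame:
import pandas as pd

ann = pd.DataFrame({'rev_id': [1, 1, 1, 2, 2],
                    'attack': [1, 1, 0, 0, 0]})
labels = ann.groupby('rev_id')['attack'].mean() > 0.5  # rev 1 -> True, rev 2 -> False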
+ """ + + ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634' + ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637' + + def download_file(url): + req = urllib2.Request(url) + response = urllib2.urlopen(req) + return response + + # Process comments + comments = pd.read_table( + download_file(ANNOTATED_COMMENTS_URL), index_col=0, sep='\t') + # remove newline and tab tokens + comments['comment'] = comments['comment'].apply( + lambda x: x.replace('NEWLINE_TOKEN', ' ')) + comments['comment'] = comments['comment'].apply( + lambda x: x.replace('TAB_TOKEN', ' ')) + + # Process labels + annotations = pd.read_table(download_file(ANNOTATIONS_URL), sep='\t') + # labels a comment as an atack if the majority of annoatators did so + labels = annotations.groupby('rev_id')['attack'].mean() > 0.5 + + # Perform data preprocessing, should probably tune these hyperparameters + vect = CountVectorizer(max_features=30000, ngram_range=(1, 2)) + tfidf = TfidfTransformer(norm='l2') + X = tfidf.fit_transform(vect.fit_transform(comments['comment'])) + y = np.array(labels) + data = Dataset(X, y) + return data + + +def get_keras_data(dataname): + """Get datasets using keras API and return as a Dataset object.""" + if dataname == 'cifar10_keras': + train, test = cifar10.load_data() + elif dataname == 'cifar100_coarse_keras': + train, test = cifar100.load_data('coarse') + elif dataname == 'cifar100_keras': + train, test = cifar100.load_data() + elif dataname == 'mnist_keras': + train, test = mnist.load_data() + else: + raise NotImplementedError('dataset not supported') + + X = np.concatenate((train[0], test[0])) + y = np.concatenate((train[1], test[1])) + + if dataname == 'mnist_keras': + # Add extra dimension for channel + num_rows = X.shape[1] + num_cols = X.shape[2] + X = X.reshape(X.shape[0], 1, num_rows, num_cols) + if K.image_data_format() == 'channels_last': + X = X.transpose(0, 2, 3, 1) + + y = y.flatten() + data = Dataset(X, y) + return data + + +# TODO(lishal): remove regular cifar10 dataset and only use dataset downloaded +# from keras to maintain image dims to create tensor for tf models +# Requires adding handling in run_experiment.py for handling of different +# training methods that require either 2d or tensor data. +def get_cifar10(): + """Get CIFAR-10 dataset from source dir. + + Slightly redundant with keras function to get cifar10 but this returns + in flat format instead of keras numpy image tensor. + """ + url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' + def download_file(url): + req = urllib2.Request(url) + response = urllib2.urlopen(req) + return response + response = download_file(url) + tmpfile = BytesIO() + while True: + # Download a piece of the file from the connection + s = response.read(16384) + # Once the entire file has been downloaded, tarfile returns b'' + # (the empty bytes) which is a falsey value + if not s: + break + # Otherwise, write the piece of the file to the temporary file. 
+    tmpfile.write(s)
+  response.close()
+
+  tmpfile.seek(0)
+  tar_dir = tarfile.open(mode='r:gz', fileobj=tmpfile)
+  X = None
+  y = None
+  for member in tar_dir.getnames():
+    if '_batch' in member:
+      filestream = tar_dir.extractfile(member).read()
+      batch = pickle.load(StringIO.StringIO(filestream))
+      if X is None:
+        X = np.array(batch['data'], dtype=np.uint8)
+        y = np.array(batch['labels'])
+      else:
+        X = np.concatenate((X, np.array(batch['data'], dtype=np.uint8)))
+        y = np.concatenate((y, np.array(batch['labels'])))
+  data = Dataset(X, y)
+  return data
+
+
+def get_mldata(dataset):
+  # Use scikit-learn to grab datasets and save them to save_dir.
+  save_dir = FLAGS.save_dir
+  filename = os.path.join(save_dir, dataset[1]+'.pkl')
+
+  if not gfile.Exists(save_dir):
+    gfile.MkDir(save_dir)
+  if not gfile.Exists(filename):
+    if dataset[0][-3:] == 'csv':
+      data = get_csv_data(dataset[0])
+    elif dataset[0] == 'breast_cancer':
+      data = load_breast_cancer()
+    elif dataset[0] == 'iris':
+      data = load_iris()
+    elif dataset[0] == 'newsgroup':
+      # Remove header information to make sure that no newsgroup-identifying
+      # information is included in the data.
+      data = fetch_20newsgroups_vectorized(subset='all', remove=('headers'))
+      tfidf = TfidfTransformer(norm='l2')
+      X = tfidf.fit_transform(data.data)
+      data.data = X
+    elif dataset[0] == 'rcv1':
+      sklearn.datasets.rcv1.URL = (
+          'http://www.ai.mit.edu/projects/jmlr/papers/'
+          'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
+      sklearn.datasets.rcv1.URL_topics = (
+          'http://www.ai.mit.edu/projects/jmlr/papers/'
+          'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
+      data = sklearn.datasets.fetch_rcv1(
+          data_home='/tmp')
+    elif dataset[0] == 'wikipedia_attack':
+      data = get_wikipedia_talk_data()
+    elif dataset[0] == 'cifar10':
+      data = get_cifar10()
+    elif 'keras' in dataset[0]:
+      data = get_keras_data(dataset[0])
+    else:
+      try:
+        data = fetch_mldata(dataset[0])
+      except:
+        raise Exception('ERROR: failed to fetch data from mldata.org')
+    X = data.data
+    y = data.target
+    if X.shape[0] != y.shape[0]:
+      X = np.transpose(X)
+    assert X.shape[0] == y.shape[0]
+
+    data = {'data': X, 'target': y}
+    pickle.dump(data, gfile.GFile(filename, 'w'))
+
+
+def main(argv):
+  del argv  # Unused.
+  # First entry of tuple is mldata.org name, second is the name that we'll use
+  # to reference the data.
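+  # For example, ('mnist (original)', 'mnist') fetches the mldata.org dataset
+  # named 'mnist (original)' and saves it as mnist.pkl under FLAGS.save_dir.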
+ datasets = [('mnist (original)', 'mnist'), ('australian', 'australian'), + ('heart', 'heart'), ('breast_cancer', 'breast_cancer'), + ('iris', 'iris'), ('vehicle', 'vehicle'), ('wine', 'wine'), + ('waveform ida', 'waveform'), ('german ida', 'german'), + ('splice ida', 'splice'), ('ringnorm ida', 'ringnorm'), + ('twonorm ida', 'twonorm'), ('diabetes_scale', 'diabetes'), + ('mushrooms', 'mushrooms'), ('letter', 'letter'), ('dna', 'dna'), + ('banana-ida', 'banana'), ('letter', 'letter'), ('dna', 'dna'), + ('newsgroup', 'newsgroup'), ('cifar10', 'cifar10'), + ('cifar10_keras', 'cifar10_keras'), + ('cifar100_keras', 'cifar100_keras'), + ('cifar100_coarse_keras', 'cifar100_coarse_keras'), + ('mnist_keras', 'mnist_keras'), + ('wikipedia_attack', 'wikipedia_attack'), + ('rcv1', 'rcv1')] + + if FLAGS.datasets: + subset = FLAGS.datasets.split(',') + datasets = [d for d in datasets if d[1] in subset] + + for d in datasets: + print(d[1]) + get_mldata(d) + + +if __name__ == '__main__': + app.run() diff --git a/utils/kernel_block_solver.py b/utils/kernel_block_solver.py new file mode 100644 index 0000000..d3e29eb --- /dev/null +++ b/utils/kernel_block_solver.py @@ -0,0 +1,185 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Block kernel lsqr solver for multi-class classification.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import math + +import numpy as np +import scipy.linalg as linalg +from scipy.sparse.linalg import spsolve +from sklearn import metrics + + +class BlockKernelSolver(object): + """Inspired by algorithm from https://arxiv.org/pdf/1602.05310.pdf.""" + # TODO(lishal): save preformed kernel matrix and reuse if possible + # perhaps not possible if want to keep scikitlearn signature + + def __init__(self, + random_state=1, + C=0.1, + block_size=4000, + epochs=3, + verbose=False, + gamma=None): + self.block_size = block_size + self.epochs = epochs + self.C = C + self.kernel = 'rbf' + self.coef_ = None + self.verbose = verbose + self.encode_map = None + self.decode_map = None + self.gamma = gamma + self.X_train = None + self.random_state = random_state + + def encode_y(self, y): + # Handles classes that do not start counting from 0. 
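+    # For example, labels [2, 5, 9] are encoded as [0, 1, 2]; decode_y applies
+    # the inverse mapping.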
+ if self.encode_map is None: + self.classes_ = sorted(list(set(y))) + self.encode_map = dict(zip(self.classes_, range(len(self.classes_)))) + self.decode_map = dict(zip(range(len(self.classes_)), self.classes_)) + mapper = lambda x: self.encode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def decode_y(self, y): + mapper = lambda x: self.decode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def fit(self, X_train, y_train, sample_weight=None): + """Form K and solve (K + lambda * I)x = y in a block-wise fashion.""" + np.random.seed(self.random_state) + self.X_train = X_train + n_features = X_train.shape[1] + y = self.encode_y(y_train) + if self.gamma is None: + self.gamma = 1./n_features + K = metrics.pairwise.pairwise_kernels( + X_train, metric=self.kernel, gamma=self.gamma) + if self.verbose: + print('Finished forming kernel matrix.') + + # compute some constants + num_classes = len(list(set(y))) + num_samples = K.shape[0] + num_blocks = math.ceil(num_samples*1.0/self.block_size) + x = np.zeros((K.shape[0], num_classes)) + y_hat = np.zeros((K.shape[0], num_classes)) + onehot = lambda x: np.eye(num_classes)[x] + y_onehot = np.array(map(onehot, y)) + idxes = np.diag_indices(num_samples) + if sample_weight is not None: + weights = np.sqrt(sample_weight) + weights = weights[:, np.newaxis] + y_onehot = weights * y_onehot + K *= np.outer(weights, weights) + if num_blocks == 1: + epochs = 1 + else: + epochs = self.epochs + + for e in range(epochs): + shuffled_coords = np.random.choice( + num_samples, num_samples, replace=False) + for b in range(int(num_blocks)): + residuals = y_onehot - y_hat + + # Form a block of K. + K[idxes] += (self.C * num_samples) + block = shuffled_coords[b*self.block_size: + min((b+1)*self.block_size, num_samples)] + K_block = K[:, block] + # Dim should be block size x block size + KbTKb = K_block.T.dot(K_block) + + if self.verbose: + print('solving block {0}'.format(b)) + # Try linalg solve then sparse solve for handling of sparse input. 
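+        # The update solves the normal equations
+        # KbTKb * dx = K_block.T.dot(residuals) for this block, and dx is then
+        # added to this block's coefficients in x below.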
+ try: + x_block = linalg.solve(KbTKb, K_block.T.dot(residuals)) + except: + try: + x_block = spsolve(KbTKb, K_block.T.dot(residuals)) + except: + return None + + # update model + x[block] = x[block] + x_block + K[idxes] = K[idxes] - (self.C * num_samples) + y_hat = K.dot(x) + + y_pred = np.argmax(y_hat, axis=1) + train_acc = metrics.accuracy_score(y, y_pred) + if self.verbose: + print('Epoch: {0}, Block: {1}, Train Accuracy: {2}' + .format(e, b, train_acc)) + self.coef_ = x + + def predict(self, X_val): + val_K = metrics.pairwise.pairwise_kernels( + X_val, self.X_train, metric=self.kernel, gamma=self.gamma) + val_pred = np.argmax(val_K.dot(self.coef_), axis=1) + return self.decode_y(val_pred) + + def score(self, X_val, val_y): + val_pred = self.predict(X_val) + val_acc = metrics.accuracy_score(val_y, val_pred) + return val_acc + + def decision_function(self, X, type='predicted'): + # Return the predicted value of the best class + # Margin_AL will see that a vector is returned and not a matrix and + # simply select the points that have the lowest predicted value to label + K = metrics.pairwise.pairwise_kernels( + X, self.X_train, metric=self.kernel, gamma=self.gamma) + predicted = K.dot(self.coef_) + if type == 'scores': + val_best = np.max(K.dot(self.coef_), axis=1) + return val_best + elif type == 'predicted': + return predicted + else: + raise NotImplementedError('Invalid return type for decision function.') + + def get_params(self, deep=False): + params = {} + params['C'] = self.C + params['gamma'] = self.gamma + if deep: + return copy.deepcopy(params) + return copy.copy(params) + + def set_params(self, **parameters): + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self + + def softmax_over_predicted(self, X): + val_K = metrics.pairwise.pairwise_kernels( + X, self.X_train, metric=self.kernel, gamma=self.gamma) + val_pred = val_K.dot(self.coef_) + row_min = np.min(val_pred, axis=1) + val_pred = val_pred - row_min[:, None] + val_pred = np.exp(val_pred) + sum_exp = np.sum(val_pred, axis=1) + val_pred = val_pred/sum_exp[:, None] + return val_pred diff --git a/utils/small_cnn.py b/utils/small_cnn.py new file mode 100644 index 0000000..ea8b0dd --- /dev/null +++ b/utils/small_cnn.py @@ -0,0 +1,199 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements Small CNN model in keras using tensorflow backend.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +import keras +import keras.backend as K +from keras.layers import Activation +from keras.layers import Conv2D +from keras.layers import Dense +from keras.layers import Dropout +from keras.layers import Flatten +from keras.layers import MaxPooling2D +from keras.models import Sequential + +import numpy as np +import tensorflow as tf + + +class SmallCNN(object): + """Small convnet that matches sklearn api. 
+ + Implements model from + https://github.com/fchollet/keras/blob/master/examples/cifar10_cnn.py + Adapts for inputs of variable size, expects data to be 4d tensor, with + # of obserations as first dimension and other dimensions to correspond to + length width and # of channels in image. + """ + + def __init__(self, + random_state=1, + epochs=50, + batch_size=32, + solver='rmsprop', + learning_rate=0.001, + lr_decay=0.): + # params + self.solver = solver + self.epochs = epochs + self.batch_size = batch_size + self.learning_rate = learning_rate + self.lr_decay = lr_decay + # data + self.encode_map = None + self.decode_map = None + self.model = None + self.random_state = random_state + self.n_classes = None + + def build_model(self, X): + # assumes that data axis order is same as the backend + input_shape = X.shape[1:] + np.random.seed(self.random_state) + tf.set_random_seed(self.random_state) + + model = Sequential() + model.add(Conv2D(32, (3, 3), padding='same', + input_shape=input_shape, name='conv1')) + model.add(Activation('relu')) + model.add(Conv2D(32, (3, 3), name='conv2')) + model.add(Activation('relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + model.add(Dropout(0.25)) + + model.add(Conv2D(64, (3, 3), padding='same', name='conv3')) + model.add(Activation('relu')) + model.add(Conv2D(64, (3, 3), name='conv4')) + model.add(Activation('relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + model.add(Dropout(0.25)) + + model.add(Flatten()) + model.add(Dense(512, name='dense1')) + model.add(Activation('relu')) + model.add(Dropout(0.5)) + model.add(Dense(self.n_classes, name='dense2')) + model.add(Activation('softmax')) + + try: + optimizer = getattr(keras.optimizers, self.solver) + except: + raise NotImplementedError('optimizer not implemented in keras') + # All optimizers with the exception of nadam take decay as named arg + try: + opt = optimizer(lr=self.learning_rate, decay=self.lr_decay) + except: + opt = optimizer(lr=self.learning_rate, schedule_decay=self.lr_decay) + + model.compile(loss='categorical_crossentropy', + optimizer=opt, + metrics=['accuracy']) + # Save initial weights so that model can be retrained with same + # initialization + self.initial_weights = copy.deepcopy(model.get_weights()) + + self.model = model + + def create_y_mat(self, y): + y_encode = self.encode_y(y) + y_encode = np.reshape(y_encode, (len(y_encode), 1)) + y_mat = keras.utils.to_categorical(y_encode, self.n_classes) + return y_mat + + # Add handling for classes that do not start counting from 0 + def encode_y(self, y): + if self.encode_map is None: + self.classes_ = sorted(list(set(y))) + self.n_classes = len(self.classes_) + self.encode_map = dict(zip(self.classes_, range(len(self.classes_)))) + self.decode_map = dict(zip(range(len(self.classes_)), self.classes_)) + mapper = lambda x: self.encode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def decode_y(self, y): + mapper = lambda x: self.decode_map[x] + transformed_y = np.array(map(mapper, y)) + return transformed_y + + def fit(self, X_train, y_train, sample_weight=None): + y_mat = self.create_y_mat(y_train) + + if self.model is None: + self.build_model(X_train) + + # We don't want incremental fit so reset learning rate and weights + K.set_value(self.model.optimizer.lr, self.learning_rate) + self.model.set_weights(self.initial_weights) + self.model.fit( + X_train, + y_mat, + batch_size=self.batch_size, + epochs=self.epochs, + shuffle=True, + sample_weight=sample_weight, + verbose=0) + + def predict(self, X_val): + 
predicted = self.model.predict(X_val) + return predicted + + def score(self, X_val, val_y): + y_mat = self.create_y_mat(val_y) + val_acc = self.model.evaluate(X_val, y_mat)[1] + return val_acc + + def decision_function(self, X): + return self.predict(X) + + def transform(self, X): + model = self.model + inp = [model.input] + activations = [] + + # Get activations of the first dense layer. + output = [layer.output for layer in model.layers if + layer.name == 'dense1'][0] + func = K.function(inp + [K.learning_phase()], [output]) + for i in range(int(X.shape[0]/self.batch_size) + 1): + minibatch = X[i * self.batch_size + : min(X.shape[0], (i+1) * self.batch_size)] + list_inputs = [minibatch, 0.] + # Learning phase. 0 = Test mode (no dropout or batch normalization) + layer_output = func(list_inputs)[0] + activations.append(layer_output) + output = np.vstack(tuple(activations)) + return output + + def get_params(self, deep = False): + params = {} + params['solver'] = self.solver + params['epochs'] = self.epochs + params['batch_size'] = self.batch_size + params['learning_rate'] = self.learning_rate + params['weight_decay'] = self.lr_decay + if deep: + return copy.deepcopy(params) + return copy.copy(params) + + def set_params(self, **parameters): + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..a1d2b8b --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,336 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for run_experiment.py.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import os +import pickle +import sys + +import numpy as np +import scipy + +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC +from sklearn.svm import SVC + +from tensorflow import gfile + + +from utils.kernel_block_solver import BlockKernelSolver +from utils.small_cnn import SmallCNN +from utils.allconv import AllConv + + +class Logger(object): + """Logging object to write to file and stdout.""" + + def __init__(self, filename): + self.terminal = sys.stdout + self.log = gfile.GFile(filename, "w") + + def write(self, message): + self.terminal.write(message) + self.log.write(message) + + def flush(self): + self.terminal.flush() + + def flush_file(self): + self.log.flush() + + +def create_checker_unbalanced(split, n, grid_size): + """Creates a dataset with two classes that occupy one color of checkboard. + + Args: + split: splits to use for class imbalance. + n: number of datapoints to sample. + grid_size: checkerboard size. + Returns: + X: 2d features. + y: binary class. 
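+
+  For example, split=[0.2, 0.8] yields roughly four times as many points in
+  one class as in the other.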
+ """ + y = np.zeros(0) + X = np.zeros((0, 2)) + for i in range(grid_size): + for j in range(grid_size): + label = 0 + n_0 = int(n/(grid_size*grid_size) * split[0] * 2) + if (i-j) % 2 == 0: + label = 1 + n_0 = int(n/(grid_size*grid_size) * split[1] * 2) + x_1 = np.random.uniform(i, i+1, n_0) + x_2 = np.random.uniform(j, j+1, n_0) + x = np.vstack((x_1, x_2)) + x = x.T + X = np.concatenate((X, x)) + y_0 = label * np.ones(n_0) + y = np.concatenate((y, y_0)) + return X, y + + +def flatten_X(X): + shape = X.shape + flat_X = X + if len(shape) > 2: + flat_X = np.reshape(X, (shape[0], np.product(shape[1:]))) + return flat_X + + +def get_mldata(data_dir, name): + """Loads data from data_dir. + + Looks for the file in data_dir. + Assumes that data is in pickle format with dictionary fields data and target. + + + Args: + data_dir: directory to look in + name: dataset name, assumes data is saved in the save_dir with filename + .pkl + Returns: + data and targets + Raises: + NameError: dataset not found in data folder. + """ + dataname = name + if dataname == "checkerboard": + X, y = create_checker_unbalanced(split=[1./5, 4./5], n=10000, grid_size=4) + else: + filename = os.path.join(data_dir, dataname + ".pkl") + if not gfile.Exists(filename): + raise NameError("ERROR: dataset not available") + data = pickle.load(gfile.GFile(filename, "r")) + X = data["data"] + y = data["target"] + if "keras" in dataname: + X = X / 255 + y = y.flatten() + return X, y + + +def filter_data(X, y, keep=None): + """Filters data by class indicated in keep. + + Args: + X: train data + y: train targets + keep: defaults to None which will keep everything, otherwise takes a list + of classes to keep + + Returns: + filtered data and targets + """ + if keep is None: + return X, y + keep_ind = [i for i in range(len(y)) if y[i] in keep] + return X[keep_ind], y[keep_ind] + + +def get_class_counts(y_full, y): + """Gets the count of all classes in a sample. + + Args: + y_full: full target vector containing all classes + y: sample vector for which to perform the count + Returns: + count of classes for the sample vector y, the class order for count will + be the same as long as same y_full is fed in + """ + classes = np.unique(y_full) + classes = np.sort(classes) + unique, counts = np.unique(y, return_counts=True) + complete_counts = [] + for c in classes: + if c not in unique: + complete_counts.append(0) + else: + index = np.where(unique == c)[0][0] + complete_counts.append(counts[index]) + return np.array(complete_counts) + + +def flip_label(y, percent_random): + """Flips a percentage of labels for one class to the other. + + Randomly sample a percent of points and randomly label the sampled points as + one of the other classes. + Does not introduce bias. 
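+  For example, percent_random=0.1 relabels roughly 10% of the points, each
+  with a class chosen uniformly from the remaining classes.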
+ + Args: + y: labels of all datapoints + percent_random: percent of datapoints to corrupt the labels + + Returns: + new labels with noisy labels for indicated percent of data + """ + classes = np.unique(y) + y_orig = copy.copy(y) + indices = range(y_orig.shape[0]) + np.random.shuffle(indices) + sample = indices[0:int(len(indices) * 1.0 * percent_random)] + fake_labels = [] + for s in sample: + label = y[s] + class_ind = np.where(classes == label)[0][0] + other_classes = np.delete(classes, class_ind) + np.random.shuffle(other_classes) + fake_label = other_classes[0] + assert fake_label != label + fake_labels.append(fake_label) + y[sample] = np.array(fake_labels) + assert all(y[indices[len(sample):]] == y_orig[indices[len(sample):]]) + return y + + +def get_model(method, seed=13): + """Construct sklearn model using either logistic regression or linear svm. + + Wraps grid search on regularization parameter over either logistic regression + or svm, returns constructed model + + Args: + method: string indicating scikit method to use, currently accepts logistic + and linear svm. + seed: int or rng to use for random state fed to scikit method + + Returns: + scikit learn model + """ + # TODO(lishal): extend to include any scikit model that implements + # a decision function. + # TODO(lishal): for kernel methods, currently using default value for gamma + # but should probably tune. + if method == "logistic": + model = LogisticRegression(random_state=seed, multi_class="multinomial", + solver="lbfgs", max_iter=200) + params = {"C": [10.0**(i) for i in range(-4, 5)]} + elif method == "logistic_ovr": + model = LogisticRegression(random_state=seed) + params = {"C": [10.0**(i) for i in range(-5, 4)]} + elif method == "linear_svm": + model = LinearSVC(random_state=seed) + params = {"C": [10.0**(i) for i in range(-4, 5)]} + elif method == "kernel_svm": + model = SVC(random_state=seed) + params = {"C": [10.0**(i) for i in range(-4, 5)]} + elif method == "kernel_ls": + model = BlockKernelSolver(random_state=seed) + params = {"C": [10.0**(i) for i in range(-6, 1)]} + elif method == "small_cnn": + # Model does not work with weighted_expert or simulate_batch + model = SmallCNN(random_state=seed) + return model + elif method == "allconv": + # Model does not work with weighted_expert or simulate_batch + model = AllConv(random_state=seed) + return model + + else: + raise NotImplementedError("ERROR: " + method + " not implemented") + + model = GridSearchCV(model, params, cv=3) + return model + + +def calculate_entropy(batch_size, y_s): + """Calculates KL div between training targets and targets selected by AL. + + Args: + batch_size: batch size of datapoints selected by AL + y_s: vector of datapoints selected by AL. Assumes that the order of the + data is the order in which points were labeled by AL. Also assumes + that in the offline setting y_s will eventually overlap completely with + original training targets. 
+ Returns: + entropy between actual distribution of classes and distribution of + samples selected by AL + """ + n_batches = int(np.ceil(len(y_s) * 1.0 / batch_size)) + counts = get_class_counts(y_s, y_s) + true_dist = counts / (len(y_s) * 1.0) + entropy = [] + for b in range(n_batches): + sample = y_s[b * batch_size:(b + 1) * batch_size] + counts = get_class_counts(y_s, sample) + sample_dist = counts / (1.0 * len(sample)) + entropy.append(scipy.stats.entropy(true_dist, sample_dist)) + return entropy + + +def get_train_val_test_splits(X, y, max_points, seed, confusion, seed_batch, + split=(2./3, 1./6, 1./6)): + """Return training, validation, and test splits for X and y. + + Args: + X: features + y: targets + max_points: # of points to use when creating splits. + seed: seed for shuffling. + confusion: labeling noise to introduce. 0.1 means randomize 10% of labels. + seed_batch: # of initial datapoints to ensure sufficient class membership. + split: percent splits for train, val, and test. + Returns: + indices: shuffled indices to recreate splits given original input data X. + y_noise: y with noise injected, needed to reproduce results outside of + run_experiments using original data. + """ + np.random.seed(seed) + X_copy = copy.copy(X) + y_copy = copy.copy(y) + + # Introduce labeling noise + y_noise = flip_label(y_copy, confusion) + + indices = np.arange(len(y)) + + if max_points is None: + max_points = len(y_noise) + else: + max_points = min(len(y_noise), max_points) + train_split = int(max_points * split[0]) + val_split = train_split + int(max_points * split[1]) + assert seed_batch <= train_split + + # Do this to make sure that the initial batch has examples from all classes + min_shuffle = 3 + n_shuffle = 0 + y_tmp = y_noise + + # Need at least 4 obs of each class for 2 fold CV to work in grid search step + while (any(get_class_counts(y_tmp, y_tmp[0:seed_batch]) < 4) + or n_shuffle < min_shuffle): + np.random.shuffle(indices) + y_tmp = y_noise[indices] + n_shuffle += 1 + + X_train = X_copy[indices[0:train_split]] + X_val = X_copy[indices[train_split:val_split]] + X_test = X_copy[indices[val_split:max_points]] + y_train = y_noise[indices[0:train_split]] + y_val = y_noise[indices[train_split:val_split]] + y_test = y_noise[indices[val_split:max_points]] + # Make sure that we have enough observations of each class for 2-fold cv + assert all(get_class_counts(y_noise, y_train[0:seed_batch]) >= 4) + # Make sure that returned shuffled indices are correct + assert all(y_noise[indices[0:max_points]] == + np.concatenate((y_train, y_val, y_test), axis=0)) + return (indices[0:max_points], X_train, y_train, + X_val, y_val, X_test, y_test, y_noise)
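+
+
+# Illustrative usage of the helpers above (a sketch only, not part of the
+# library; the 'iris' pickle and the parameter values are assumptions for
+# demonstration and require create_data.py to have been run first):
+#
+#   X, y = get_mldata('/tmp/data', 'iris')
+#   (indices, X_train, y_train, X_val, y_val, X_test, y_test,
+#    y_noise) = get_train_val_test_splits(
+#        X, y, max_points=None, seed=1, confusion=0.1, seed_batch=30)
+#   model = get_model('logistic', seed=1)
+#   model.fit(X_train, y_train)
+#   print('test accuracy: %f' % model.score(X_test, y_test))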