it is made up of multiple convolutional and ReLU layers.
You can find the model definition in the
[`utils_mnist` cleverhans module](https://github.com/openai/cleverhans/blob/master/cleverhans/utils_mnist.py).
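As a rough illustration of what such an architecture looks like, here is a minimal sketch (a hypothetical `model_sketch`, not the exact `utils_mnist` definition; the real layer sizes and hyperparameters may differ):

```
# Hypothetical sketch of a convolutional/ReLU MNIST architecture
# (Keras 1.x API); see utils_mnist.py for the definition the
# tutorial actually uses.
from keras.models import Sequential
from keras.layers import Activation, Convolution2D, Dense, Flatten

def model_sketch(nb_classes=10):
    model = Sequential()
    # Channels-first input, matching the (None, 1, 28, 28) placeholder
    model.add(Convolution2D(64, 8, 8, subsample=(2, 2),
                            border_mode='same', input_shape=(1, 28, 28)))
    model.add(Activation('relu'))
    model.add(Convolution2D(128, 6, 6, subsample=(2, 2),
                            border_mode='valid'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    return model
```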
We start by defining the input placeholders and instantiating the model. The imports and session setup that the remaining snippets rely on are shown here as well (in the full tutorial script, `FLAGS` is defined with TensorFlow's flags module):

```
import tensorflow as tf

from cleverhans.utils_mnist import data_mnist, model_mnist
from cleverhans.utils_tf import tf_model_train, tf_model_eval, batch_eval
from cleverhans.attacks import fgsm

sess = tf.Session()

# Define input TF placeholders (FLAGS.nb_classes is 10 for MNIST)
x = tf.placeholder(tf.float32, shape=(None, 1, 28, 28))
y = tf.placeholder(tf.float32, shape=(None, FLAGS.nb_classes))

# Define TF model graph
model = model_mnist()
predictions = model(x)
print("Defined TensorFlow model graph.")
```
## Training the model with TensorFlow
The library includes a helper function that runs a
TensorFlow optimizer to train models and another
helper function to load the MNIST dataset.
To train our MNIST model, we run the following:

```
# Get MNIST training and test data
X_train, Y_train, X_test, Y_test = data_mnist()

# Train the MNIST model
tf_model_train(sess, x, y, predictions, X_train, Y_train)
```
We can then evaluate the performance of this model
using `tf_model_eval`, included in `cleverhans.utils_tf`:

```
# Evaluate the accuracy of the MNIST model on legitimate test examples
accuracy = tf_model_eval(sess, x, y, predictions, X_test, Y_test)
assert X_test.shape[0] == 10000, X_test.shape
print('Test accuracy on legitimate test examples: ' + str(accuracy))
```
The accuracy returned should be above `97%`.
## Crafting adversarial examples

This tutorial applies the Fast Gradient Sign Method (FGSM)
introduced by [Goodfellow et al.](https://arxiv.org/abs/1412.6572):
each input is perturbed by a small step of size `eps` in the
direction of the sign of the gradient of the model's loss with
respect to that input.
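As a sketch of what this computation looks like in TensorFlow
(an illustrative, hypothetical `fgsm_sketch` written for exposition;
the graph construction the tutorial uses lives in
`cleverhans.attacks.fgsm`):

```
# Illustrative FGSM graph construction; use cleverhans.attacks.fgsm
# in practice. `preds` holds the model's softmax probabilities, as
# elsewhere in this tutorial.
def fgsm_sketch(x, preds, eps):
    # (Roughly) one-hot encode the model's most likely class
    y_hard = tf.to_float(
        tf.equal(preds, tf.reduce_max(preds, 1, keep_dims=True)))
    # Cross-entropy of the predictions against those labels
    loss = -tf.reduce_mean(tf.reduce_sum(y_hard * tf.log(preds), 1))
    # Step of size eps along the sign of the loss gradient:
    # x_adv = x + eps * sign(grad_x loss)
    grad, = tf.gradients(loss, x)
    return tf.stop_gradient(x + eps * tf.sign(grad))
```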
We first need to create the necessary graph elements by
calling `cleverhans.attacks.fgsm`, and then use the helper
function `cleverhans.utils_tf.batch_eval` to apply it to
our test set. This gives the following:

```
# Craft adversarial examples using the Fast Gradient Sign Method (FGSM)
adv_x = fgsm(x, predictions, eps=0.3)
X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test])
assert X_test_adv.shape[0] == 10000, X_test_adv.shape

# Evaluate the accuracy of the MNIST model on adversarial examples
accuracy = tf_model_eval(sess, x, y, predictions, X_test_adv, Y_test)
print('Test accuracy on adversarial examples: ' + str(accuracy))
```
The second part evaluates the accuracy of the model on
adversarial examples in the same way as described
previously for legitimate examples. It should be lower
than the accuracy you obtained on the legitimate test set.
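For reference, `batch_eval` simply runs the requested graph outputs
over a dataset in batches and concatenates the results. A rough
sketch (a hypothetical `batch_eval_sketch` with an assumed fixed
batch size; see `cleverhans.utils_tf.batch_eval` for the real
implementation):

```
import numpy as np

# Rough sketch of batched graph evaluation: feed numpy inputs to the
# placeholders batch by batch and stack the outputs back together.
def batch_eval_sketch(sess, tf_inputs, tf_outputs, numpy_inputs,
                      batch_size=128):
    n = numpy_inputs[0].shape[0]
    results = [[] for _ in tf_outputs]
    for start in range(0, n, batch_size):
        feed = {ph: arr[start:start + batch_size]
                for ph, arr in zip(tf_inputs, numpy_inputs)}
        for out, val in zip(results, sess.run(tf_outputs, feed_dict=feed)):
            out.append(val)
    return [np.concatenate(vals) for vals in results]
```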
## Improving robustness using adversarial training

One defense strategy to mitigate adversarial examples is
adversarial training, i.e. training the model on both the
original data and adversarially modified data (with correct
labels). You can use the training function `utils_tf.tf_model_train`
with the optional argument `predictions_adv` set to the result
of `cleverhans.attacks.fgsm` in order to perform adversarial
training.
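Conceptually, supplying `predictions_adv` turns the training
objective into a combination of the losses on clean and adversarial
inputs. A minimal sketch, assuming an equally weighted average
(`adversarial_training_loss` is a hypothetical helper; see
`utils_tf.tf_model_train` for the actual implementation):

```
# Conceptual sketch of the adversarial training objective: average
# the cross-entropy on clean and adversarial softmax predictions
# with equal weight (an assumption made for illustration).
def adversarial_training_loss(y, preds, preds_adv):
    ce_clean = -tf.reduce_sum(y * tf.log(preds), 1)
    ce_adv = -tf.reduce_sum(y * tf.log(preds_adv), 1)
    return tf.reduce_mean(0.5 * (ce_clean + ce_adv))
```

Because `adv_x_2` below is part of the same graph as the model being
trained, the adversarial examples are regenerated at each training
step with respect to the model's current parameters.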
In the following snippet, we first declare a new model (in a
way similar to the one described previously) and then train
it on both legitimate and adversarial training points.

```
# Redefine TF model graph
model_2 = model_mnist()
predictions_2 = model_2(x)
adv_x_2 = fgsm(x, predictions_2, eps=0.3)
predictions_2_adv = model_2(adv_x_2)

# Perform adversarial training
tf_model_train(sess, x, y, predictions_2, X_train, Y_train,
               predictions_adv=predictions_2_adv)
```
We can then verify that (1) its accuracy on legitimate data is
still comparable to that of the first model, and (2) its accuracy
on newly generated adversarial examples is higher.

```
# Evaluate the accuracy of the adversarially trained MNIST model on
# legitimate test examples
accuracy = tf_model_eval(sess, x, y, predictions_2, X_test, Y_test)
print('Test accuracy on legitimate test examples: ' + str(accuracy))

# Craft adversarial examples using the Fast Gradient Sign Method (FGSM)
# on the new model, which was trained using adversarial training
X_test_adv_2, = batch_eval(sess, [x], [adv_x_2], [X_test])
assert X_test_adv_2.shape[0] == 10000, X_test_adv_2.shape

# Evaluate the accuracy of the adversarially trained MNIST model on
# adversarial examples
accuracy_adv = tf_model_eval(sess, x, y, predictions_2, X_test_adv_2, Y_test)
print('Test accuracy on adversarial examples: ' + str(accuracy_adv))
```
## Code

The complete code for this tutorial is available [here](https://github.com/openai/cleverhans/blob/master/tutorials/mnist_tutorial.py).