# tftest.py
# TF 1.x sanity check: build a constant op and run it in a session.
import tensorflow as tf

hello = tf.constant('Hello, TensorFlow!')
sess = tf.Session()
print(sess.run(hello))
sess.close()
# Imports and display options for the MNIST classification exercise.
import glob
import math
import os

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
mnist_dataframe = pd.read_csv(
    "https://download.mlcc.google.cn/mledu-datasets/mnist_train_small.csv",
    sep=",",
    header=None)

# Use just the first 10,000 records for training/validation.
mnist_dataframe = mnist_dataframe.head(10000)

# Shuffle the rows so the train/validation split below is random.
mnist_dataframe = mnist_dataframe.reindex(np.random.permutation(mnist_dataframe.index))
mnist_dataframe.head()
def parse_labels_and_features(dataset):
  """Extracts labels and features.

  This is a good place to scale or transform the features if needed.

  Args:
    dataset: A Pandas `DataFrame`, containing the label in the first column and
      monochrome pixel values in the remaining columns, in row-major order.

  Returns:
    A `tuple` `(labels, features)`:
      labels: A Pandas `Series`.
      features: A Pandas `DataFrame`.
  """
  labels = dataset[0]

  # DataFrame.loc index ranges are inclusive at both ends.
  features = dataset.loc[:, 1:784]
  # Scale the data to [0, 1] by dividing out the max value, 255.
  features = features / 255

  return labels, features
training_targets, training_examples = parse_labels_and_features(mnist_dataframe[:7500])
training_examples.describe()
print(training_targets)
validation_targets, validation_examples = parse_labels_and_features(mnist_dataframe[7500:10000])
validation_examples.describe()
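# A quick, optional sanity check (a minimal sketch, not part of the original
# exercise flow): render one random training example as a 28x28 image to
# confirm the pixel layout and its label. `rand_example` is a name introduced
# here for illustration.
rand_example = np.random.choice(training_examples.index)
_, ax = plt.subplots()
ax.matshow(training_examples.loc[rand_example].values.reshape(28, 28))
ax.set_title("Label: %i" % training_targets.loc[rand_example])
ax.grid(False)
plt.show()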
def construct_feature_columns():
  """Construct the TensorFlow Feature Columns.

  Returns:
    A set of feature columns.
  """
  # There are 784 pixels in each image.
  return set([tf.feature_column.numeric_column('pixels', shape=784)])
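# Note: the single 784-wide 'pixels' column must match the "pixels" key used by
# the input functions below, since the estimator joins features to columns by
# name. (The training function further down builds an equivalent column list
# inline.)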
def create_training_input_fn(features, labels, batch_size, num_epochs=None, shuffle=True):
  """A custom input_fn for sending MNIST data to the estimator for training.

  Args:
    features: The training features.
    labels: The training labels.
    batch_size: Batch size to use during training.
    num_epochs: Number of epochs to repeat the data for (`None` = indefinitely).
    shuffle: Whether to shuffle the batched dataset.

  Returns:
    A function that returns batches of training features and labels during
    training.
  """
  def _input_fn(num_epochs=None, shuffle=True):
    # Input pipelines are reset with each call to .train(). To ensure the model
    # gets a good sampling of data, even when the number of steps is small, we
    # shuffle all the data before creating the Dataset object.
    idx = np.random.permutation(features.index)
    # Note: `raw_features` maps a single key to an array of all the examples.
    raw_features = {"pixels": features.reindex(idx)}
    raw_targets = np.array(labels[idx])

    ds = Dataset.from_tensor_slices((raw_features, raw_targets))  # warning: 2GB limit
    ds = ds.batch(batch_size).repeat(num_epochs)

    if shuffle:
      ds = ds.shuffle(10000)

    # Return the next batch of data.
    feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
    return feature_batch, label_batch

  return _input_fn
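# Illustrative usage (a sketch, assuming TF 1.x graph mode; not executed here):
# the returned _input_fn builds ops that yield one batch per session run.
#
#   train_fn = create_training_input_fn(training_examples, training_targets, batch_size=10)
#   feature_batch, label_batch = train_fn()
#   with tf.Session() as sess:
#     f, l = sess.run([feature_batch, label_batch])
#     print(f["pixels"].shape, l.shape)  # expect (10, 784) and (10,)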
def create_predict_input_fn(features, labels, batch_size):
  """A custom input_fn for sending MNIST data to the estimator for predictions.

  Args:
    features: The features to base predictions on.
    labels: The labels of the prediction examples.
    batch_size: Batch size to use for prediction.

  Returns:
    A function that returns features and labels for predictions.
  """
  def _input_fn():
    raw_features = {"pixels": features.values}
    raw_targets = np.array(labels)

    ds = Dataset.from_tensor_slices((raw_features, raw_targets))  # warning: 2GB limit
    ds = ds.batch(batch_size)

    # Return the next batch of data.
    feature_batch, label_batch = ds.make_one_shot_iterator().get_next()
    return feature_batch, label_batch

  return _input_fn
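# Unlike the training input_fn, the prediction input_fn above neither shuffles
# nor repeats: it makes a single, ordered pass so predictions line up with
# `labels` row for row, which the log-loss and accuracy computations below
# rely on.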
mnist_test_dataframe = pd.read_csv(
    "https://download.mlcc.google.cn/mledu-datasets/mnist_test.csv",
    sep=",",
    header=None)
test_targets, test_examples = parse_labels_and_features(mnist_test_dataframe)
test_examples.describe()
def train_nn_classification_model(
    learning_rate,
    steps,
    batch_size,
    hidden_units,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  """Trains a neural network classification model for the MNIST digits dataset.

  In addition to training, this function also prints training progress
  information, a plot of the training and validation loss over time, and a
  confusion matrix.

  Args:
    learning_rate: A `float`, the learning rate to use.
    steps: A non-zero `int`, the total number of training steps. A training
      step consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    hidden_units: A `list` of int values, specifying the number of neurons in
      each hidden layer.
    training_examples: A `DataFrame` containing the training features.
    training_targets: A `Series` containing the training labels.
    validation_examples: A `DataFrame` containing the validation features.
    validation_targets: A `Series` containing the validation labels.

  Returns:
    The trained `DNNClassifier` object.
  """
  periods = 10
  # Caution: input pipelines are reset with each call to train.
  # If the number of steps is small, your model may never see most of the data.
  # So with multiple `.train` calls like this you may want to control the
  # length of training with num_epochs passed to the input_fn. Or, you can do
  # a really big shuffle, or, since it's in-memory data, shuffle all the data
  # in the `input_fn` itself.
  steps_per_period = steps // periods

  # Create the input functions.
  predict_training_input_fn = create_predict_input_fn(
      training_examples, training_targets, batch_size)
  predict_validation_input_fn = create_predict_input_fn(
      validation_examples, validation_targets, batch_size)
  training_input_fn = create_training_input_fn(
      training_examples, training_targets, batch_size)
  # Create feature columns.
  feature_columns = [tf.feature_column.numeric_column('pixels', shape=784)]

  # Create a DNNClassifier object.
  my_optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  classifier = tf.estimator.DNNClassifier(
      feature_columns=feature_columns,
      n_classes=10,
      hidden_units=hidden_units,
      optimizer=my_optimizer,
      config=tf.contrib.learn.RunConfig(keep_checkpoint_max=1)
  )

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("LogLoss error (on validation data):")
  training_errors = []
  validation_errors = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    classifier.train(
        input_fn=training_input_fn,
        steps=steps_per_period
    )

    # Take a break and compute probabilities.
    training_predictions = list(classifier.predict(input_fn=predict_training_input_fn))
    training_probabilities = np.array([item['probabilities'] for item in training_predictions])
    training_pred_class_id = np.array([item['class_ids'][0] for item in training_predictions])
    training_pred_one_hot = tf.keras.utils.to_categorical(training_pred_class_id, 10)

    validation_predictions = list(classifier.predict(input_fn=predict_validation_input_fn))
    validation_probabilities = np.array([item['probabilities'] for item in validation_predictions])
    validation_pred_class_id = np.array([item['class_ids'][0] for item in validation_predictions])
    validation_pred_one_hot = tf.keras.utils.to_categorical(validation_pred_class_id, 10)

    # Compute training and validation errors.
    training_log_loss = metrics.log_loss(training_targets, training_pred_one_hot)
    validation_log_loss = metrics.log_loss(validation_targets, validation_pred_one_hot)
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, validation_log_loss))
    # Add the loss metrics from this period to our list.
    training_errors.append(training_log_loss)
    validation_errors.append(validation_log_loss)
  print("Model training finished.")

  # Remove event files to save disk space. Wrap `map` in `list` so the removals
  # actually execute under Python 3, where `map` is lazy.
  _ = list(map(os.remove, glob.glob(os.path.join(classifier.model_dir, 'events.out.tfevents*'))))

  # Calculate final predictions (not probabilities, as above).
  final_predictions = classifier.predict(input_fn=predict_validation_input_fn)
  final_predictions = np.array([item['class_ids'][0] for item in final_predictions])

  accuracy = metrics.accuracy_score(validation_targets, final_predictions)
  print("Final accuracy (on validation data): %0.2f" % accuracy)

  # Output a graph of loss metrics over periods.
  plt.ylabel("LogLoss")
  plt.xlabel("Periods")
  plt.title("LogLoss vs. Periods")
  plt.plot(training_errors, label="training")
  plt.plot(validation_errors, label="validation")
  plt.legend()
  plt.show()

  # Output a plot of the confusion matrix.
  cm = metrics.confusion_matrix(validation_targets, final_predictions)
  # Normalize the confusion matrix by row (i.e. by the number of samples in
  # each class).
  cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
  ax = sns.heatmap(cm_normalized, cmap="bone_r")
  ax.set_aspect(1)
  plt.title("Confusion matrix")
  plt.ylabel("True label")
  plt.xlabel("Predicted label")
  plt.show()

  return classifier
classifier = train_nn_classification_model(
    learning_rate=0.05,
    steps=1000,
    batch_size=30,
    hidden_units=[3, 3],
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)
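# The [3, 3] network above is deliberately tiny and will score poorly. An
# illustrative stronger configuration to experiment with (these hyperparameter
# values are assumptions, not tuned results):
#
# classifier = train_nn_classification_model(
#     learning_rate=0.05,
#     steps=1000,
#     batch_size=30,
#     hidden_units=[100, 100],
#     training_examples=training_examples,
#     training_targets=training_targets,
#     validation_examples=validation_examples,
#     validation_targets=validation_targets)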
predict_test_input_fn = create_predict_input_fn(
    test_examples, test_targets, batch_size=100)

test_predictions = classifier.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item['class_ids'][0] for item in test_predictions])

accuracy = metrics.accuracy_score(test_targets, test_predictions)
print("Accuracy on test data: %0.2f" % accuracy)