#!/usr/bin/env python3
'''
Train and evaluate a support-vector classifier (SVC) on the PCA-projected
data: grid-search hyperparameters on the validation split, fit the chosen
model on the training split, and report accuracy, precision, recall, and a
confusion matrix on the test split.
'''
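# Example invocation (the directory path and class names below are
# hypothetical -- the directory just needs to contain the CSV files
# presumably written by an earlier standardize/PCA step):
#
#   ./evaluate_svn.py --seed 42 --classes benign,malignant --fig data/projected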
import sys
import argparse
import pathlib
import math
import confusion
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
## get values from command line if given
argparser = argparse.ArgumentParser(
    description="Train and evaluate an SVC on the PCA-projected data")
argparser.add_argument("-s", "--seed", action="store",
                       default=None, type=int,
                       help="Set the random seed -- if set to any value, every "
                            "run of the program will use the same sequence of "
                            "random values")
argparser.add_argument("-c", "--classes", action="store", default=None,
                       help="A comma-separated list of class names to report in "
                            "evaluation metrics. If not supplied, class names "
                            "based on integer numbers starting at zero will be used.")
argparser.add_argument("-f", "--fig", action="store_true",
                       help="Store the confusion matrix in a PDF figure")
argparser.add_argument("dirname",
                       type=pathlib.Path,
                       help="The directory containing the projected data and label CSV files")
args = argparser.parse_args(sys.argv[1:])
if args.classes is None:
    CLASS_NAMES = None
else:
    CLASS_NAMES = args.classes.split(',')
## set the seed if given for reproducible runs
if args.seed is not None:
    random.seed(args.seed)
    np.random.seed(args.seed)  # sklearn draws its randomness from numpy
MAKE_FIG = args.fig
DATA_DIRNAME = args.dirname
print(" . Loading....")
X_training = pd.read_csv("%s/X_proj_train.csv" % DATA_DIRNAME)
X_validation = pd.read_csv("%s/X_proj_validation.csv" % DATA_DIRNAME)
X_test = pd.read_csv("%s/X_proj_test.csv" % DATA_DIRNAME)
y_training = pd.read_csv("%s/y_train.csv" % DATA_DIRNAME)
y_validation = pd.read_csv("%s/y_validation.csv" % DATA_DIRNAME)
y_test = pd.read_csv("%s/y_test.csv" % DATA_DIRNAME)
## need to convert X to numpy arrays and y to flat 1-d label arrays
X_training = X_training.to_numpy()
X_validation = X_validation.to_numpy()
X_test = X_test.to_numpy()
# flatten the (n, 1) label columns into 1-d arrays, as sklearn expects
y_training = np.ravel(y_training.to_numpy())
y_validation = np.ravel(y_validation.to_numpy())
y_test = np.ravel(y_test.to_numpy())
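# A quick consistency check (assumes each X_proj_* file has exactly one row
# per label in the matching y_* file); failing here is clearer than an error
# deep inside sklearn.
assert X_training.shape[0] == y_training.shape[0], "train X/y row counts differ"
assert X_validation.shape[0] == y_validation.shape[0], "validation X/y row counts differ"
assert X_test.shape[0] == y_test.shape[0], "test X/y row counts differ"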
##
## Use "grid search" to find good values for our SVC for this
## training set.
##
# Set up the list of values to search among
parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'],
     'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
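# (C trades off training error against margin width -- larger C penalizes
# misclassified points more heavily; gamma sets how far a single example's
# influence reaches under the RBF kernel -- larger gamma gives a more local,
# more flexible decision boundary.)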
# perform the search
print(" . Performing grid search")
svc_search = GridSearchCV(
    SVC(),       # estimator object
    parameters,  # parameters to search among
    n_jobs=-1,   # -1 means "use all available CPUs"
    verbose=1)   # print progress messages as the search runs
svc_search.fit(X_validation, y_validation)
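# (GridSearchCV runs its own k-fold cross-validation -- 5-fold by default --
# inside the data it is given, here the validation split.)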
best_parameters = svc_search.best_params_
print(" . Best parameters found for SVN:", best_parameters)
print(" . Creating SVC model to use on training data")
# Here the "**" means that we take the list of keyword=arg
# values in best_parameters and convert it into a dictionary
# to pass as arguments to the creation of the SVC implementation
# that we will use to actually train on our training data.
#
# This line creates the classifier with the best training parameters
model = SVC(**best_parameters)
# ... and finally, we use our training data to train the classifier
model.fit(X_training, y_training)
##
## Evaluate the results
##
print(" . Performing evaluation:")
# Make some predictions on the test data -- this generates
# a set of new labels, which should match the test labels
# if our classifier is a strong one
y_pred = model.predict(X_test)
# If these labels are correct, they will match the ones in y_test
prediction_accuracy = metrics.accuracy_score(y_test, y_pred)
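# precision and recall need an explicit averaging rule when there are more
# than two classes (sklearn's default only handles binary labels);
# macro-averaging weights every class equally.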
unique_labels = np.unique(y_training)
if len(unique_labels) <= 2:
    prediction_precision = metrics.precision_score(y_test, y_pred)
    prediction_recall = metrics.recall_score(y_test, y_pred)
else:
    prediction_precision = metrics.precision_score(y_test, y_pred,
                                                   average="macro")
    prediction_recall = metrics.recall_score(y_test, y_pred,
                                             average="macro")
print('Performance metrics: accuracy %.2f, precision %.2f, recall %.2f'
% (prediction_accuracy, prediction_precision, prediction_recall))
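# Optional per-class breakdown (uncomment to use); with target_names=None
# sklearn simply reports the integer labels:
# print(metrics.classification_report(y_test, y_pred, target_names=CLASS_NAMES))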
outcome = pd.DataFrame()
outcome['predicted'] = y_pred
outcome['actual'] = y_test
outcome.to_csv('%s/svn_outcome.csv' % DATA_DIRNAME, index=False)
# Calculate confusion matrix and print it
cm = metrics.confusion_matrix(y_test, y_pred)
confusion.print_confusion_matrix(cm, CLASS_NAMES, y_training)
confusion.print_confusion_matrix(cm, CLASS_NAMES, y_training,
filename="%s/confusion.txt" % DATA_DIRNAME)
if MAKE_FIG:
    confusion.confusion_matrix_heatmap(
        '%s/confusion_heatmap.pdf' % DATA_DIRNAME,
        cm, CLASS_NAMES, y_training)