"""
This file is part of CIUSuite 2
Copyright (C) 2018 Daniel Polasky and Sugyan Dixit
Module for classification schemes for CIU data groups
Authors: Dan Polasky, Sugyan Dixit
Date: 1/11/2018
"""
from Gaussian_Fitting import Gaussian
import numpy as np
import pandas
import matplotlib.pyplot as plt
import matplotlib.patches
import pickle
import os
import itertools
import random
import logging
import time
import tkinter
from tkinter import messagebox
from tkinter import ttk
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import precision_score, roc_curve, auc
from sklearn.feature_selection import f_classif, GenericUnivariateSelect
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from typing import List
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from CIU_analysis_obj import CIUAnalysisObj
from CIU_Params import Parameters
from Feature_Detection import Feature
# load main logger
logger = logging.getLogger('main')
def main_build_classification_new(cl_inputs_by_label, subclass_labels, params_obj, output_dir, known_feats=None):
"""
Main method for classification. Performs feature selection followed by LDA and classification
and generates output and plots. Returns a ClassificationScheme object to be saved for future
classification of unknowns. Allows use of subclasses.
NOTE: requires that input data has had axes equalized previously (assumes all axes are identical at this point)
:param cl_inputs_by_label: lists of ClInput containers sorted by class label
:type cl_inputs_by_label: list[list[ClInput]]
:param subclass_labels: list of subclass labels (strings). If no subclasses present, default to ['0']
:param output_dir: directory in which to save plots/output
:param params_obj: Parameters object with classification parameter information
:type params_obj: Parameters
:param known_feats: list of features (optional, allows manual mode)
:return: ClassificationScheme object with the generated scheme
:rtype: ClassificationScheme
"""
start_time = time.time()
# Data preparation for Gaussians (if applicable) and standardization
max_num_gaussians = prep_data_2d(cl_inputs_by_label, params_obj)
cl_inputs_by_label, means, stdevs = standardize_all_2d(cl_inputs_by_label, params_obj)
logger.debug('standardization finished: {:.2f}s'.format(time.time() - start_time))
if known_feats is None:
# convert to subclass oriented data (if no subclasses, will be a single entry in a list)
class_labels = [class_list[0].class_label for class_list in cl_inputs_by_label]
list_classif_inputs = subclass_inputs_from_class_inputs(cl_inputs_by_label, subclass_labels, class_labels)
# run feature selection and crossvalidation to select best features automatically
all_features = multi_subclass_ufs(list_classif_inputs, params_obj, output_dir, subclass_labels)
logger.debug('UFS finished: {:.2f}s'.format(time.time() - start_time))
# assess all features to determine which to use in the final scheme
best_features, crossval_score, all_crossval_data = crossval_main_new(cl_inputs_by_label, output_dir, params_obj, all_features, subclass_labels)
else:
# Manual mode: use the provided features and run limited crossvalidation
best_features, crossval_score, all_crossval_data = crossval_main_new(cl_inputs_by_label, output_dir, params_obj, known_feats, subclass_labels)
best_features = known_feats
all_features = known_feats
logger.debug('crossval finished: {:.2f}s'.format(time.time() - start_time))
# perform LDA and classification on the selected/best features
shaped_subsets = rearrange_ciu_by_feats(cl_inputs_by_label, best_features, params_obj)
flat_subsets = [x for class_list in shaped_subsets for x in class_list]
constructed_scheme = lda_svc_best_feats(flat_subsets, best_features, all_features, output_dir, subclass_labels, max_num_gaussians)
constructed_scheme.crossval_test_score = crossval_score
constructed_scheme.all_crossval_data = all_crossval_data
constructed_scheme.standard_means = means
constructed_scheme.standard_stdevs = stdevs
# plot output here for now, will probably move eventually
plot_classification_decision_regions(constructed_scheme, params_obj, output_dir)
logger.debug('classif (scheme = {}) finished: {:.2f}s'.format(constructed_scheme.name, time.time() - start_time))
return constructed_scheme
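# Illustrative usage sketch (not part of the original module) of how a caller might invoke the
# scheme builder above. The loader name used to assemble the ClInput lists is hypothetical (in
# CIUSuite 2 the GUI builds these containers); only the call signature comes from this file.
#   cl_inputs_by_label = load_clinputs_grouped_by_class(...)    # hypothetical helper
#   scheme = main_build_classification_new(cl_inputs_by_label,
#                                           subclass_labels=['0'],    # no subclasses
#                                           params_obj=my_params,     # CIU_Params.Parameters instance
#                                           output_dir='classif_output')
#   # the returned ClassificationScheme can then be pickled for later classification of unknowns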
def multi_subclass_ufs(subclass_input_list, params_obj, output_path, subclass_labels):
"""
Perform univariate feature selection across classes using multiple subclasses (e.g. different charge states
of CIU fingerprints). Essentially performs standard UFS on each subclass separately and combines
all output features/scores into a single list, from which the best features can be chosen for
    LDA/SVM classification. Inputs are structured as in the standard UFS method, except provided
    as a list of inputs, one per subclass, rather than a single input (the standard method takes
    only one "subclass").
:param subclass_input_list: list of ClassifInput containers for each subclass
:type subclass_input_list: list[ClassifInput]
:param params_obj: parameters information
:type params_obj: Parameters
:param output_path: directory in which to save plot
:param subclass_labels: list of strings for scheme name
:return: list of all features, sorted in decreasing order of score from ALL subclasses
:rtype: list[CFeature]
"""
# Iterate over all subclass lists to generate feature score information
features = []
features_by_subclass = []
for subclass_input in subclass_input_list:
# generate all combinations of replicate datasets within the labels
shaped_label_list = subclass_input.shaped_label_list
scores = generate_products_for_ufs(subclass_input.analysis_objs_by_label, shaped_label_list, params_obj)
# Create a CFeature object to hold the information for this CV (feature)
mean_score = np.mean(scores, axis=0)
std_score = np.std(scores, axis=0)
cv_axis = subclass_input.analysis_objs_by_label[0][0].axes[1]
subclass_features = []
for cv_index, cv in enumerate(cv_axis):
feature = CFeature(cv, cv_index, mean_score[cv_index], std_score[cv_index], subclass_label=subclass_input.subclass_label)
features.append(feature)
subclass_features.append(feature)
features_by_subclass.append(subclass_features)
# sort feature scores either by mean - stdev ("error mode") or just mean alone.
if params_obj.classif_6_ufs_use_error_mode:
sorted_features = sorted(features, key=lambda x: (x.mean_score - x.std_dev_score), reverse=True)
else:
sorted_features = sorted(features, key=lambda x: x.mean_score, reverse=True)
unique_labels = get_unique_labels([x for label_list in subclass_input_list[0].shaped_label_list for x in label_list])
scheme_name = generate_scheme_name(unique_labels, subclass_labels)
plot_feature_scores_subclass(features_by_subclass, params_obj, scheme_name, output_path)
save_feature_scores(features_by_subclass, scheme_name, output_path)
return sorted_features
def generate_products_for_ufs(analysis_obj_list_by_label, shaped_label_list, params_obj):
"""
Generate all combinations of replicate data across classes for feature selection. Will
create a DataProduct object with the key information for each combination.
:param analysis_obj_list_by_label: list of lists of CIUAnalysisObj's, sorted by class label
:type analysis_obj_list_by_label: list[list[CIUAnalysisObj]]
:param shaped_label_list: list of lists of class labels with matching shape of analysis_obj_by_label
:param params_obj: parameter info
:type params_obj: Parameters
:return: list of DataProduct objects for each combination
:rtype: list[UFSResult]
"""
scores = []
for object_tuple, label_tuple in zip(itertools.product(*analysis_obj_list_by_label), itertools.product(*shaped_label_list)):
# create a UFSResult object for this combination
# data_list = [x.ciu_data for x in object_tuple]
data_list = [get_classif_data(x, params_obj, ufs_mode=True) for x in object_tuple]
label_list = [x for x in label_tuple]
product = UFSResult(data_list, label_list)
# Run feature selection for this combination
select = GenericUnivariateSelect(score_func=f_classif, mode='percentile', param=100)
select.fit(product.combined_data, product.numeric_label_arr)
product.fit_pvalues = select.pvalues_
product.fit_scores = select.scores_
product.fit_sc = -np.log10(select.pvalues_)
# scores.append(product.fit_scores)
scores.append(product.fit_sc) # don't save whole product to reduce memory load
return scores
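# Minimal, self-contained sketch (toy data, not from the original pipeline) of the per-column
# scoring performed above: GenericUnivariateSelect with f_classif returns an ANOVA F-score and
# p-value for each column (here, each collision voltage), and the UFS score is -log10(p).
#   import numpy as np
#   from sklearn.feature_selection import f_classif, GenericUnivariateSelect
#   x = np.array([[1.0, 5.0], [1.1, 5.2], [3.0, 5.1], [3.2, 4.9]])   # 4 replicates x 2 "CV" columns
#   y = np.array([1, 1, 2, 2])                                       # two classes
#   sel = GenericUnivariateSelect(score_func=f_classif, mode='percentile', param=100).fit(x, y)
#   ufs_scores = -np.log10(sel.pvalues_)   # column 0 separates the classes, so it scores much higher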
def roc_curve_area_multiclass(x_train, y_train, x_test, y_test, svc=None):
"""
    Create a one-vs-all classifier and calculate the ROC FPR, TPR, and AUC for each class,
    along with the micro-average and macro-average ROC curves.
    :param x_train: x_train lda
    :param y_train: y_train labels
    :param x_test: x_test lda
    :param y_test: y_test labels
:param svc: SVC classifier from cross validation to use for binary classifications. None for multiclass
:return: fpr, tpr, and roc_auc for class specific, micro-average, and macro-average
:rtype: dict
"""
output_dict = {}
temp_fpr = np.linspace(0, 1, 100)
unique_class_labels = np.unique(y_train)
if len(unique_class_labels) == 2:
# only 2 classes, and thus only 1 classifier. No OneVsRest classification needed
y_train_binary = binarize_2class(unique_class_labels, y_train)
y_test_binary = binarize_2class(unique_class_labels, y_test)
# use saved classifier for binary if available
if svc is not None:
clf = svc
else:
clf = SVC(kernel='linear', C=1, probability=True, max_iter=1000)
clf.fit(x_train, y_train_binary)
y_score = clf.decision_function(x_test)
# generate ROC curve
fpr_class, tpr_class, thr = roc_curve(y_test_binary, y_score)
interp_tpr = np.interp(temp_fpr, fpr_class, tpr_class)
interp_tpr[0] = 0
roc_auc_interp = auc(temp_fpr, interp_tpr)
interp_tpr_class = [[interp_tpr]]
# save micro/macro avg as just regular fpr/tpr
interp_tpr_micro = interp_tpr
interp_tpr_macro = interp_tpr
roc_auc_micro_interp = roc_auc_interp
roc_auc_macro_interp = roc_auc_interp
roc_auc_class_interp = [roc_auc_interp]
else:
# multiclass: must be binarized (converted to one vs rest classifiers) for ROC analysis
y_train_binary = label_binarize(y_train, classes=unique_class_labels)
y_test_binary = label_binarize(y_test, classes=unique_class_labels)
n_classes = y_train_binary.shape[1]
clf = OneVsRestClassifier(SVC(kernel='linear', C=1, probability=True, max_iter=1000)).fit(x_train, y_train_binary)
y_score = clf.decision_function(x_test)
# create dicts for fpr, tpr, and roc_auc
fpr_class = [[] for _ in range(n_classes)]
tpr_class = [[] for _ in range(n_classes)]
interp_tpr_class = [[] for _ in range(n_classes)]
roc_auc_class_interp = [[] for _ in range(n_classes)]
for index in range(n_classes):
fpr_class[index], tpr_class[index], _ = roc_curve(y_test_binary[:, index], y_score[:, index])
interp_tpr = np.zeros(np.shape(temp_fpr))
interp_tpr += np.interp(temp_fpr, fpr_class[index], tpr_class[index])
interp_tpr[0] = 0
interp_tpr_class[index].append(interp_tpr)
roc_auc_class_interp[index] = auc(temp_fpr, interp_tpr_class[index][0])
roc_auc_class_interp = np.asarray(roc_auc_class_interp)
interp_tpr_class = np.asarray(interp_tpr_class)
# compute micro-average ROC curve and ROC area
fpr_micro, tpr_micro, _ = roc_curve(y_test_binary.ravel(), y_score.ravel())
interp_tpr_micro = np.zeros(np.shape(temp_fpr))
interp_tpr_micro += np.interp(temp_fpr, fpr_micro, tpr_micro)
interp_tpr_micro[0] = 0
roc_auc_micro_interp = auc(temp_fpr, interp_tpr_micro)
# compute macro-average ROC curve and ROC area
all_fpr = np.unique(np.concatenate([fpr_class[x] for x in range(n_classes)]))
# interpolate all ROC curves at these fpr points
mean_tpr = np.zeros_like(all_fpr)
for index in range(n_classes):
mean_tpr += np.interp(all_fpr, fpr_class[index], tpr_class[index])
# average tpr and compute AUC
mean_tpr /= n_classes
interp_tpr_macro = np.zeros(np.shape(temp_fpr))
interp_tpr_macro += np.interp(temp_fpr, all_fpr, mean_tpr)
interp_tpr_macro[0] = 0
roc_auc_macro_interp = auc(temp_fpr, interp_tpr_macro)
# save final outputs
output_dict['tmp_fpr'] = temp_fpr
output_dict['tpr_class'] = interp_tpr_class
output_dict['roc_auc_class'] = roc_auc_class_interp
output_dict['tpr_micro'] = interp_tpr_micro
output_dict['roc_auc_micro'] = roc_auc_micro_interp
output_dict['tpr_macro'] = interp_tpr_macro
output_dict['roc_auc_macro'] = roc_auc_macro_interp
return output_dict
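# Illustrative sketch (toy data, not from the original pipeline) of the interpolation used above:
# each ROC curve is resampled onto a common 100-point FPR grid with np.interp so that curves from
# different crossval iterations can later be averaged point-by-point.
#   import numpy as np
#   from sklearn.metrics import roc_curve, auc
#   y_true = np.array([0, 0, 1, 1])
#   y_score = np.array([0.1, 0.4, 0.35, 0.8])
#   fpr, tpr, _ = roc_curve(y_true, y_score)
#   temp_fpr = np.linspace(0, 1, 100)
#   interp_tpr = np.interp(temp_fpr, fpr, tpr)
#   interp_tpr[0] = 0
#   roc_auc = auc(temp_fpr, interp_tpr)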
def binarize_2class(unique_class_labels, label_data):
"""
    Binarize labels for the 2-class case for ROC curve analysis (convert labels to 0 or 1). For the
    typical numeric labels (1, 2) this amounts to subtracting 1, but arbitrary label values are supported.
:param unique_class_labels: list of unique class labels (usually numeric)
:param label_data: data to convert
:return: binarized label_data
"""
label_encoding = {}
bin_index = 0
for class_label in unique_class_labels:
label_encoding[class_label] = bin_index
bin_index += 1
output_data = []
for label in label_data:
output_data.append(label_encoding[label])
return output_data
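# Example (illustrative values, not part of the original module): labels are mapped to 0/1 in the
# order they appear in unique_class_labels, and any hashable label values work the same way.
#   binarize_2class([1, 2], [1, 2, 2, 1])      # -> [0, 1, 1, 0]
#   binarize_2class(['a', 'b'], ['b', 'a'])    # -> [1, 0]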
def plot_roc_cuve(roc_data, class_labels, schem_name, dirpath, params_obj, selected_features=None):
"""
Plot ROC curves for each number of features in a single PDF document. roc_data input is the
saved results dictionary from a CrossValRun container.
:param roc_data: saved results dictionary from a CrossValRun container.
:param class_labels: list of strings - labels for each class
    :param schem_name: (string) scheme name used for output file naming
    :param dirpath: directory in which to save plot(s)
    :param selected_features: if provided, plot only the final ROC curve for this list of selected features
:param params_obj: parameters
:type params_obj: Parameters
:return:
"""
tmp_fpr = roc_data['tmp_fpr'][0]
tpr_class_mean, tpr_class_std, roc_auc_class_mean, roc_auc_class_std = roc_data['tpr_class_mean'], roc_data['tpr_class_std'], roc_data['roc_auc_class_mean'], roc_data['roc_auc_class_std']
tpr_micro_mean, tpr_micro_std, roc_auc_micro_mean, roc_auc_micro_std = roc_data['tpr_micro_mean'], roc_data['tpr_micro_std'], roc_data['roc_auc_micro_mean'], roc_data['roc_auc_micro_std']
tpr_macro_mean, tpr_macro_std, roc_auc_macro_mean, roc_auc_macro_std = roc_data['tpr_macro_mean'], roc_data['tpr_macro_std'], roc_data['roc_auc_macro_mean'], roc_data['roc_auc_macro_std']
if selected_features is None:
pdf_output = os.path.join(dirpath, schem_name + '_ROC_curves.pdf')
try:
testfile = open(pdf_output, 'a')
testfile.close()
except PermissionError:
messagebox.showerror('Please Close the File Before Saving',
'The file {} is being used by another process! Please close it, THEN press the OK button to retry saving'.format(
pdf_output))
with PdfPages(pdf_output) as pdf:
for index in range(len(tpr_class_mean)):
for num, (tpr_class_mean_, tpr_class_std_, roc_auc_class_mean_, roc_auc_class_std_) in enumerate(zip(tpr_class_mean[index], tpr_class_std[index], roc_auc_class_mean[index], roc_auc_class_std[index])):
plt.plot(tmp_fpr, tpr_class_mean_[0], linestyle=':', label='{0} {1:0.2f} +/- {2:0.2f}'.format(class_labels[num], roc_auc_class_mean_, roc_auc_class_std_))
plt.fill_between(tmp_fpr, tpr_class_mean_[0] + tpr_class_std_[0], tpr_class_mean_[0] - tpr_class_std_[0], alpha=0.2)
# plot macro/micro averages only if more than one classifier
if len(tpr_class_mean[0]) > 1:
plt.plot(tmp_fpr, tpr_micro_mean[index], color='navy', label='ROC_micro {0:0.2f} +/- {1:0.2f}'.format(roc_auc_micro_mean[index], roc_auc_micro_std[index]))
plt.fill_between(tmp_fpr, tpr_micro_mean[index] + tpr_micro_std[index], tpr_micro_mean[index] - tpr_micro_std[index], color='black', alpha=0.4)
plt.plot(tmp_fpr, tpr_macro_mean[index], color='red', label='ROC_macro {0:0.2f} +/- {1:0.2f}'.format(roc_auc_macro_mean[index], roc_auc_macro_std[index]))
plt.fill_between(tmp_fpr, tpr_macro_mean[index] + tpr_macro_std[index], tpr_macro_mean[index] - tpr_macro_std[index], color='red', alpha=0.4)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# plot titles, labels, and legends
if params_obj.plot_12_custom_title is not None:
plot_title = params_obj.plot_12_custom_title
plt.title(plot_title, fontsize=params_obj.plot_13_font_size, fontweight='bold')
elif params_obj.plot_11_show_title:
plot_title = 'ROC: {} Features'.format(index + 1)
plt.title(plot_title, fontsize=params_obj.plot_13_font_size, fontweight='bold')
if params_obj.plot_08_show_axes_titles:
plt.xlabel('False Positive Rate', fontsize=params_obj.plot_13_font_size, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=params_obj.plot_13_font_size, fontweight='bold')
plt.xticks(fontsize=params_obj.plot_13_font_size)
plt.yticks(fontsize=params_obj.plot_13_font_size)
if params_obj.plot_07_show_legend:
plt.legend(loc='best', fontsize='small')
try:
pdf.savefig()
except PermissionError:
messagebox.showerror('Please Close the File Before Saving', 'The file {} is being used by another process! Please close it, THEN press the OK button to retry saving'.format(pdf_output))
                    pdf.savefig()
plt.close()
else:
# plot ROC curve for single number of selected features
index = len(selected_features) - 1 # indexed from 0 in the lists, not 1
for num, (tpr_class_mean_, tpr_class_std_, roc_auc_class_mean_, roc_auc_class_std_) in enumerate(zip(tpr_class_mean[index], tpr_class_std[index], roc_auc_class_mean[index], roc_auc_class_std[index])):
plt.plot(tmp_fpr, tpr_class_mean_[0], linestyle=':', label='{0} {1:0.2f} +/- {2:0.2f}'.format(class_labels[num], roc_auc_class_mean_, roc_auc_class_std_))
plt.fill_between(tmp_fpr, tpr_class_mean_[0] + tpr_class_std_[0], tpr_class_mean_[0] - tpr_class_std_[0], alpha=0.2)
# plot macro/micro averages only if more than one classifier
if len(tpr_class_mean[0]) > 1:
plt.plot(tmp_fpr, tpr_micro_mean[index], color='navy', label='ROC_micro {0:0.2f} +/- {1:0.2f}'.format(roc_auc_micro_mean[index], roc_auc_micro_std[index]))
plt.fill_between(tmp_fpr, tpr_micro_mean[index] + tpr_micro_std[index], tpr_micro_mean[index] - tpr_micro_std[index], color='black', alpha=0.4)
plt.plot(tmp_fpr, tpr_macro_mean[index], color='red', label='ROC_macro {0:0.2f} +/- {1:0.2f}'.format(roc_auc_macro_mean[index], roc_auc_macro_std[index]))
plt.fill_between(tmp_fpr, tpr_macro_mean[index] + tpr_macro_std[index], tpr_macro_mean[index] - tpr_macro_std[index], color='red', alpha=0.4)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# plot titles, labels, and legends
if params_obj.plot_12_custom_title is not None:
plot_title = params_obj.plot_12_custom_title
plt.title(plot_title, fontsize=params_obj.plot_13_font_size, fontweight='bold')
elif params_obj.plot_11_show_title:
plot_title = 'ROC: {} Features'.format(len(selected_features))
plt.title(plot_title, fontsize=params_obj.plot_13_font_size, fontweight='bold')
if params_obj.plot_08_show_axes_titles:
plt.xlabel('False Positive Rate', fontsize=params_obj.plot_13_font_size, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=params_obj.plot_13_font_size, fontweight='bold')
plt.xticks(fontsize=params_obj.plot_13_font_size)
plt.yticks(fontsize=params_obj.plot_13_font_size)
if params_obj.plot_07_show_legend:
plt.legend(loc='best', fontsize='small')
output_name = os.path.join(dirpath, schem_name + '_final-ROC' + params_obj.plot_02_extension)
try:
plt.savefig(output_name)
except PermissionError:
messagebox.showerror('Please Close the File Before Saving', 'The file {} is being used by another process! Please close it, THEN press the OK button to retry saving'.format(output_name))
plt.savefig(output_name)
plt.close()
def crossval_main_new(cl_inputs_by_label, outputdir, params_obj, features_list, subclass_labels):
"""
    Cross validation method supporting multiple subclasses. Reduces the datasets to the selected
    features first, then performs cross validation with modular methods.
:param cl_inputs_by_label: lists of ClInput containers sorted by class label
:type cl_inputs_by_label: list[list[ClInput]]
:param outputdir: directory in which to save output plots
:param params_obj: parameters container
:type params_obj: Parameters
:param features_list: List of CFeatures, sorted in decreasing order of score
:type features_list: list[CFeature]
:param subclass_labels: list of strings for scheme naming purposes
:return: list of selected features, test score for that # features, and all cross validation data
"""
    # determine training size as the size of the smallest class minus the test size (number of test files per class)
min_class_size = np.min([len(x) for x in cl_inputs_by_label])
training_size = min_class_size - params_obj.classif_91_test_size
if training_size < 2:
        logger.warning('WARNING! Testing size provided ({}) was too large: at least one class had fewer than 2 replicates of training data. A test size of 1 was used instead.'.format(params_obj.classif_91_test_size))
training_size = min_class_size - 1
label_list = [class_list[0].class_label for class_list in cl_inputs_by_label]
# optional max number of features to consider
if params_obj.classif_7_max_feats_for_crossval > 0:
max_features = params_obj.classif_7_max_feats_for_crossval
if max_features > len(features_list):
max_features = len(features_list) + 1
else:
max_features = len(features_list) + 1
    # determine the number of products. If less than the max # of iterations (or if # iterations = 0), compute all products. Otherwise, randomly sample to save memory
num_products = 1
for cl_input_list in cl_inputs_by_label:
num_permutations = len(list(itertools.permutations(cl_input_list, params_obj.classif_91_test_size)))
num_products *= num_permutations
if num_products < params_obj.classif_8_max_crossval_iterations or params_obj.classif_8_max_crossval_iterations == 0:
calc_all_products = True
else:
calc_all_products = False
current_features_list = []
all_results_by_feats = {}
for ind, feature in enumerate(features_list[:max_features]):
# Generate all combinations - NOTE: assumes that if subclasses are present, features know their subclass (should always be true)
logger.info('Performing cross validation for {} of {} features'.format(ind + 1, len(features_list[:max_features])))
current_features_list.append(feature)
# format all data
shaped_data_list = rearrange_ciu_by_feats(cl_inputs_by_label, current_features_list, params_obj)
# perform the cross validation for this feature combination
crossval_obj = CrossValRun(shaped_data_list, label_list, training_size, current_features_list)
if calc_all_products:
crossval_obj.divide_data_and_run_lda()
crossval_obj.assemble_class_products(params_obj.classif_8_max_crossval_iterations)
else:
crossval_obj.random_sample_run_lda(params_obj.classif_8_max_crossval_iterations)
# save results into a dictionary with same keys as original crossval dict
for key, value in crossval_obj.results.items():
try:
all_results_by_feats[key].append(crossval_obj.results[key])
except KeyError:
all_results_by_feats[key] = [crossval_obj.results[key]]
all_results_by_feats['train_scores_mean'] = np.asarray(all_results_by_feats['train_scores_mean'])
all_results_by_feats['train_scores_std'] = np.asarray(all_results_by_feats['train_scores_std'])
all_results_by_feats['test_scores_mean'] = np.asarray(all_results_by_feats['test_scores_mean'])
all_results_by_feats['test_scores_std'] = np.asarray(all_results_by_feats['test_scores_std'])
# save and plot crossvalidation score information
crossval_acc_data = [all_results_by_feats['train_scores_mean'], all_results_by_feats['train_scores_std'], all_results_by_feats['test_scores_mean'], all_results_by_feats['test_scores_std'], all_results_by_feats['roc_auc_micro_mean'], all_results_by_feats['roc_auc_micro_std']]
unique_labels = get_unique_labels(label_list)
scheme_name = generate_scheme_name(unique_labels, subclass_labels)
crossval_file = save_crossval_score(crossval_acc_data, scheme_name, outputdir)
save_roc_data(all_results_by_feats, crossval_file)
plot_crossval_scores(crossval_acc_data, scheme_name, params_obj, outputdir)
plot_roc_cuve(all_results_by_feats, unique_labels, scheme_name, outputdir, params_obj)
# plot_crossval_auc(all_results_by_feats['roc_auc_micro_mean'], all_results_by_feats['roc_auc_micro_std'], scheme_name, params_obj, outputdir)
# determine best features list from crossval scores
if params_obj.classif_4_score_mode == 'auc':
score_list = all_results_by_feats['roc_auc_micro_mean']
else:
score_list = all_results_by_feats['test_scores_mean']
best_num_feats, best_score = peak_crossval_score_detect(score_list, params_obj.classif_2_score_dif_tol)
output_features = features_list[0: best_num_feats]
plot_roc_cuve(all_results_by_feats, unique_labels, scheme_name, outputdir, params_obj, selected_features=output_features)
logger.info('Cross validation complete!')
return output_features, best_score, crossval_acc_data
def rearrange_ciu_by_feats(shaped_inputs_list, features_list, params_obj):
"""
For each CIU dataset in the original input, generate a rearranged (and possibly shrunken)
matrix of data in order of features in the provided features list. Designed to allow easy
access to the CV subset data of interest prior to doing crossval/LDA/etc. Handles subclasses.
:param features_list: list of features in descending order of score from UFS
:type features_list: list[CFeature]
    :param shaped_inputs_list: lists by class of ClInput containers with classification input info
:type shaped_inputs_list: list[list[ClInput]]
:param params_obj: parameters container for getting classif data (raw data or gaussian mode)
:type params_obj: Parameters
:return: shaped output data with selected feature data in order
:rtype: list[list[DataSubset]]
"""
# Loop over the shaped input list, extracting data and maintaining the same organization
shaped_output_list = []
class_numeric_label = 1
for class_list in shaped_inputs_list:
class_outputs = []
for rep_obj in class_list:
# generate a subset container to hold the extracted data and associated metadata
rep_subset = rearrange_ciu_by_feats_helper(rep_obj, params_obj, features_list, class_numeric_label)
class_outputs.append(rep_subset)
shaped_output_list.append(class_outputs)
class_numeric_label += 1
return shaped_output_list
def rearrange_ciu_by_feats_helper(rep_obj, params_obj, features_list, class_numeric_label, num_gaussian_override=None):
"""
Rearrange CIU data in feature order for a single replicate to generate a single DataSubset
container to return.
:param rep_obj: input ClInput container with raw data
:type rep_obj: ClInput
    :param params_obj: parameters
:type params_obj: Parameters
:param features_list: list of features in decreasing order of score
:type features_list: list[CFeature]
:param class_numeric_label: (int) numeric label for scheme. Set to 0 for unknown data
:param num_gaussian_override: For Gaussian mode with unknowns - require that the max num gaussians be that of the previously saved classification scheme
:return: DataSubset container with initialized data
:rtype: DataSubset
"""
data_output = []
if features_list[0].subclass_label is not None:
        # multiple subclasses, so the file_id is the filename of the top-scoring subclass object plus the number of subclasses
len_subclasses = len(rep_obj.subclass_dict.keys())
file_id = rep_obj.subclass_dict[features_list[0].subclass_label].short_filename + '_{}SubCl'.format(len_subclasses)
else:
# only one object, so just use its filename
file_id = rep_obj.get_subclass_obj().short_filename
for feature in features_list:
if feature.subclass_label is not None:
subclass_obj = rep_obj.get_subclass_obj(feature.subclass_label)
else:
subclass_obj = rep_obj.get_subclass_obj()
# Determine the correct CV column to append to the growing matrix and do so
current_cv_axis = subclass_obj.axes[1]
this_cv_index = (np.abs(np.asarray(current_cv_axis) - feature.cv)).argmin()
raw_data = get_classif_data(subclass_obj, params_obj)
cv_col = raw_data.T[this_cv_index]
data_output.append(cv_col)
# generate a subset container to hold the extracted data and associated metadata
rep_subset = DataSubset(data_output, rep_obj.class_label, class_numeric_label, file_id, rep_obj.all_filenames, features_list)
return rep_subset
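# Minimal sketch (toy values, not from the original pipeline) of the nearest-CV lookup used above:
# the feature's CV is matched to the closest value on this replicate's CV axis, and that column of
# the (transposed) data matrix is appended to the subset.
#   import numpy as np
#   cv_axis = np.array([10.0, 20.0, 30.0, 40.0])
#   feature_cv = 29.0                                         # hypothetical feature CV
#   this_cv_index = (np.abs(cv_axis - feature_cv)).argmin()   # -> 2 (the 30 V column)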
def arrange_lda_new(subset_list):
"""
Prepare data and label arrays for input to LDA methods given an input list of DataSubset containers
with formatted data and labels
:param subset_list: list of DataSubset containers with formatted data and labels
:type subset_list: list[DataSubset]
:return: x_data, numeric labels, string labels for direct input into LDA
"""
x_data = []
string_labels, numeric_labels = [], []
# assemble each dataset into a single list by combining all columns of the input matrices
for subset in subset_list:
for column in subset.data:
x_data.append(column)
string_labels.append(subset.class_label)
numeric_labels.append(subset.numeric_label)
# convert to numpy arrays for SKLearn analyses
x_data = np.asarray(x_data)
string_labels = np.asarray(string_labels)
numeric_labels = np.asarray(numeric_labels)
return x_data, numeric_labels, string_labels
def lda_svc_best_feats(flat_subset_list, selected_features, all_features, output_dir, subclass_labels, max_num_gaussians=0):
"""
Generate a Scheme container by performing final LDA/SVM analysis on the provided data.
:param flat_subset_list: list of DataSubset containers
:type flat_subset_list: list[DataSubset]
:param selected_features: selected (best) features to use in scheme construction
:type selected_features: list[CFeature]
:param all_features: list of all features input for reference
:type all_features: list[CFeature]
:param output_dir: directory in which to save output
:param subclass_labels: list of strings for scheme output naming
:param max_num_gaussians: for Gaussian mode, the max number of Gaussians to record in the scheme
:return: Scheme container
:rtype: ClassificationScheme
"""
train_data, train_numeric_labels, train_string_labels = arrange_lda_new(flat_subset_list)
svc, lda = run_lda_svc(train_data, train_numeric_labels)
x_lda = lda.transform(train_data)
expl_var_r = lda.explained_variance_ratio_
# build classification scheme
clf = SVC(kernel='linear', C=1, probability=True, max_iter=1000)
clf.fit(x_lda, train_numeric_labels)
y_pred = clf.predict(x_lda)
prec_score = precision_score(train_numeric_labels, y_pred, pos_label=1, average='weighted')
probs = clf.predict_proba(x_lda)
# initialize classification scheme object and return it
scheme = ClassificationScheme()
scheme.all_features = all_features
scheme.selected_features = selected_features
scheme.classifier = clf
scheme.classifier_type = 'SVC'
scheme.classif_prec_score = prec_score
scheme.lda = lda
scheme.explained_variance_ratio = expl_var_r
scheme.numeric_labels = train_numeric_labels
scheme.class_labels = train_string_labels
scheme.unique_labels = get_unique_labels(train_string_labels)
scheme.transformed_test_data = x_lda
scheme.params = clf.get_params()
scheme.input_feats = [feature.cv for feature in selected_features]
scheme.name = generate_scheme_name(train_string_labels, subclass_labels)
scheme.num_gaussians = max_num_gaussians
# Organize outputs by input file for easier viewing, then save
x_lda_by_file, y_pred_by_file, probs_by_file, filenames_by_file, combined_filenames = prep_outputs_by_file_new(x_lda, y_pred, probs, flat_subset_list)
save_lda_and_predictions(scheme, x_lda_by_file, y_pred_by_file, probs_by_file, filenames_by_file, combined_filenames, output_dir, unknowns_bool=False)
return scheme
def run_lda_svc(x_data, label_data):
"""
Run LDA and SVC analysis of a set of data and return the resulting classifier
:param x_data: input x data
:param label_data: input label data
:return: sklearn.svm.SVC classifier object and LDA object
:rtype: SVC, LinearDiscriminantAnalysis
"""
lda = LinearDiscriminantAnalysis(solver='svd', n_components=5)
lda.fit(x_data, label_data)
train_lda = lda.transform(x_data)
# max_iter=1000 needed to prevent occasional (and unpredictable) freezes with ridiculous iteration numbers
svm = SVC(kernel='linear', C=1, probability=True, max_iter=1000, cache_size=200)
try:
svm.fit(train_lda, label_data)
except ValueError:
logger.error('Error in SVM fitting. This should not be reached - check your input data for duplicates (same input used in multiple classes)')
return svm, lda
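# Self-contained sketch (toy data, not from the original pipeline) of the LDA -> linear SVC chain
# used above: data is projected into LDA space first, then the SVC is trained on the projection.
#   import numpy as np
#   from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#   from sklearn.svm import SVC
#   x = np.array([[1.0, 2.0], [1.2, 1.9], [3.0, 0.5], [3.1, 0.4]])
#   y = np.array([1, 1, 2, 2])
#   lda = LinearDiscriminantAnalysis(solver='svd').fit(x, y)
#   x_lda = lda.transform(x)                                   # 1 component for 2 classes
#   svm = SVC(kernel='linear', C=1, probability=True, max_iter=1000).fit(x_lda, y)
#   svm.predict(lda.transform([[1.1, 2.1]]))                   # -> array([1])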
# todo: deprecate
# def standardize_data_old(ciu_data):
# """
# Standardize the input CIU data using the common (xi - x_mean) / stdev approach. The UFS
# is behaving strangely with negative values in the input so the output is 'floored' to
# have no negative values.
# :param ciu_data: 2D numpy array of CIU data
# :return:
# """
# # powertransf = PowerTransformer(method='yeo-johnson')
# # powertransf.fit(ciu_data)
# # std_data = powertransf.transform(ciu_data)
# # return std_data
#
# cv_data = np.swapaxes(ciu_data, 0, 1)
# output_data = np.ndarray(np.shape(cv_data))
#
# # smooth each column and return the data (axes swapped back to normal)
# index = 0
# while index < len(cv_data):
# current_col = cv_data[index]
# normed_col = (current_col - np.mean(current_col)) / np.std(current_col)
# # normed_col = abs((current_col - np.mean(current_col)) / np.std(current_col))
# normed_col[normed_col < 0] = 0 # set all negative values to 0
# output_data[index] = normed_col
# index += 1
# output_data = np.swapaxes(output_data, 0, 1)
# return output_data
def get_classif_data(analysis_obj, params_obj, ufs_mode=False):
"""
    Return the classification data matrix for an analysis object according to the classification
    mode specified in the parameters object. In All_Data mode, this is simply the (standardized)
    ciu_data matrix. In Gaussian mode, it is Gaussian information from the object's fitted Gaussian
    lists.
    :param analysis_obj: analysis object
:type analysis_obj: CIUAnalysisObj
:param params_obj: Parameters object with classification parameter information
:type params_obj: Parameters
:param ufs_mode: boolean, True if using for UFS (feature selection), which requires only centroids from gaussian fitting
:return: classification data matrix
"""
if params_obj.classif_1_input_mode == 'All_Data':
classif_data = analysis_obj.classif_input_std
else:
if not ufs_mode:
# for non-UFS, use full input (standardized) Gaussian dataset.
classif_data = analysis_obj.classif_input_std
else:
            # for UFS, use only the centroids - drop the width and amplitude entries
centroids_by_cv = []
for cv_index, gaussian_data_at_cv in enumerate(analysis_obj.classif_input_std.T):
current_centroids = []
for feat_index, gaussian_attribute in enumerate(gaussian_data_at_cv):
if feat_index % 3 == 0:
current_centroids.append(gaussian_attribute)
centroids_by_cv.append(current_centroids)
            classif_data = np.asarray(centroids_by_cv).T  # transpose back, since the loop iterated over the transposed data to access it by CV
return classif_data
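# Illustrative sketch (toy matrix, not from the original pipeline) of the centroid extraction for
# UFS mode above: with the flattened [centroid, width, amplitude] layout per Gaussian (as built by
# prep_gaussian_input_raw_old), keeping every third row recovers just the centroids.
#   import numpy as np
#   gauss_matrix = np.array([[10.0, 11.0],    # centroid,  gaussian 1
#                            [1.0, 1.1],      # width,     gaussian 1
#                            [0.9, 0.8],      # amplitude, gaussian 1
#                            [20.0, 21.0]])   # centroid,  gaussian 2 (columns = CVs)
#   centroids = gauss_matrix[::3]             # rows 0 and 3 -> centroids only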
def prep_gaussfeats_for_classif(features_list, analysis_obj):
"""
    Assemble a Gaussian-list-by-CV list from input features data. Fills any gaps within features
    with median-valued placeholder Gaussians, and CVs not covered by any feature with zero
    Gaussians, to assemble a complete list of Gaussians by CV.
:param features_list: list of Features
:type features_list: list[Feature]
:param analysis_obj: CIUAnalysisObj with gaussian fitting and gaussian feature detect performed
:type analysis_obj: CIUAnalysisObj
:return: List of (Gaussian lists) sorted by CV
:rtype: list[list[Gaussian]]
"""
# make an empty list for Gaussians at each CV
final_gaussian_lists = [[] for _ in analysis_obj.axes[1]]
features_list = close_feature_gaps(features_list, analysis_obj.axes[1])
# iterate over features, filling any gaps within the feature and entering Gaussians into the final list
for feature in features_list:
# determine if the feature contains gaps
gaussian_cvs = [gaussian.cv for gaussian in feature.gaussians]
for cv in feature.cvs:
# append Gaussian(s) at this CV to the final list
try:
cv_index = np.where(analysis_obj.axes[1] == cv)[0][0]
except IndexError:
# A gaussian had a CV that was not in the analysis object's CV axis! This should be caught elsewhere
logger.warning('Gaussian had CV {}, but that CV is not in the CV axis of this file (after axes were equalized across all files). It will be ignored.'.format(cv))
continue
this_cv_gaussian = [x for x in feature.gaussians if x.cv == cv]
final_gaussian_lists[cv_index].extend(this_cv_gaussian)
if cv not in gaussian_cvs:
                # a gap is present within this feature - create a Gaussian at the median centroid/width/amplitude to fill it
new_gaussian = Gaussian(amplitude=np.median([x.amplitude for x in feature.gaussians]),
centroid=feature.gauss_median_centroid,
width=np.median([x.width for x in feature.gaussians]),
collision_voltage=cv,
pcov=None,
protein_bool=True)
final_gaussian_lists[cv_index].append(new_gaussian)
for cv_index, cv in enumerate(analysis_obj.axes[1]):
if len(final_gaussian_lists[cv_index]) == 0:
# no Gaussians have been added here yet, so we need to add one. Add an empty Gaussian (all 0's)
final_gaussian_lists[cv_index].append(Gaussian(centroid=0, amplitude=0, width=0, collision_voltage=cv, pcov=None, protein_bool=False))
analysis_obj.classif_gaussians_by_cv = final_gaussian_lists
return final_gaussian_lists
def close_feature_gaps(features_list, cv_axis):
"""
Check all features for gaps in their CV lists, and fill in the gaps if any exist by inserting
appropriate CV values
:param features_list: list of Features
:type features_list list[Feature]
:param cv_axis: analysis_obj.axes[1]
:return: updated features list with gaps closed (in feature.cvs ONLY)
:rtype: list[Feature]
"""
cv_step = cv_axis[1] - cv_axis[0]
for feature in features_list:
for index, current_cv in enumerate(feature.cvs):
try:
next_cv = feature.cvs[index + 1]
except IndexError:
# reached the end, ignore
continue
while not (next_cv - current_cv) == cv_step:
# a gap is present - insert the next value to fill it
correct_next = current_cv + cv_step
feature.cvs.insert(index + 1, correct_next)
next_cv = feature.cvs[index + 1]
return features_list
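# Minimal sketch (plain list, not a Feature object) of the gap-closing logic above: missing CV
# steps between consecutive entries are inserted so the list becomes contiguous on the CV axis.
#   cvs, cv_step = [10, 20, 50], 10
#   index = 0
#   while index < len(cvs) - 1:
#       if cvs[index + 1] - cvs[index] != cv_step:
#           cvs.insert(index + 1, cvs[index] + cv_step)
#       index += 1
#   # cvs -> [10, 20, 30, 40, 50]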
def peak_crossval_score_detect(test_score_means, diff_from_max):
"""
Determine the best set of features based on crossvalidation testing scores generated in
crossval_main. Chooses the first 'peak' (point after which score decreases) that is within
tolerance of the overall maximum score in the data. This is to choose the minimum number of
features while still achieving a high score.
:param test_score_means: list of scores for cross validation test data, in increasing order of number of features
:param diff_from_max: maximum distance below the max value of test_score_means that a peak is allowed for selection. Default 0.05
:return: best number of features (index of test_score_means) and score (value)
"""
max_score = np.max(test_score_means)
for index, value in enumerate(test_score_means):
try:
if test_score_means[index + 1] < value:
# stop here (reached a peak) if within tolerance of max
if max_score - value <= diff_from_max:
return index + 1, value # index + 1 because we're determining the NUM of features, indexed from 1 (not 0)
except IndexError:
# reached the end of the list - return final index + 1 (because indexing from 1 for num feats) and value
return index + 1, value
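# Example (illustrative values, not from the original pipeline): the first local peak within the
# tolerance of the overall maximum is chosen, favoring fewer features.
#   peak_crossval_score_detect([0.70, 0.90, 0.95, 0.93], diff_from_max=0.05)   # -> (3, 0.95)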
def prep_data_2d(cl_inputs_by_label, params_obj):
"""
Wrapper method for 2D input list of ClInputs (for use in scheme construction) to ensure
all CIU analyses ultimately have the same size Gaussian data matrices for classification.
:param cl_inputs_by_label: lists of ClInput containers sorted by class label
:type cl_inputs_by_label: list[list[ClInput]]
:param params_obj: Parameters object with classification parameter information
:type params_obj: Parameters
:return: max number of Gaussians in the input data (also sets it to params_obj)
"""
max_num_gaussians = 0
for input_list in cl_inputs_by_label:
for cl_input in input_list:
for subclass_label, analysis_obj in cl_input.subclass_dict.items():
if not params_obj.classif_1_input_mode == 'All_Data':
if params_obj.classif_1_input_mode == 'Gaussian_Feat':
# prepare gaussian features if using feature mode for classification (saves to container)
gaussians_by_cv = prep_gaussfeats_for_classif(analysis_obj.features_gaussian, analysis_obj)
else:
# Gaussian raw mode, so Gaussians by CV comes directly from the container
gaussians_by_cv = analysis_obj.raw_protein_gaussians
analysis_obj.classif_gaussians_by_cv = gaussians_by_cv
# update the max number of gaussians if necessary
for gaussian_list in gaussians_by_cv:
if len(gaussian_list) > max_num_gaussians:
max_num_gaussians = len(gaussian_list)
# save num Gaussians to ensure all matrices same size
params_obj.silent_clf_4_num_gauss = max_num_gaussians
else:
# all data mode - initialize raw data for classification
analysis_obj.classif_input_raw = analysis_obj.ciu_data
if not params_obj.classif_1_input_mode == 'All_Data':
        # second pass required for Gaussians to format the final input data matrix (now that we know the max number of Gaussians)
for input_list in cl_inputs_by_label:
for cl_input in input_list:
for subclass_label, analysis_obj in cl_input.subclass_dict.items():
if params_obj.classif_93_std_all_gaussians_bool:
input_classif_raw = prep_gaussian_input_raw(analysis_obj.classif_gaussians_by_cv)
else:
input_classif_raw = prep_gaussian_input_raw_old(analysis_obj.classif_gaussians_by_cv, max_num_gaussians)
analysis_obj.classif_input_raw = input_classif_raw
return max_num_gaussians
def prep_gaussian_input_raw(gaussians_by_cv, selected_cvs=None):
"""
Assemble a 2D numpy array of correct final dimensions from a list of Gaussians at
    each collision voltage. Selected CVs can be provided (e.g. for unknown data). Groups the Gaussians
    at each collision voltage by attribute, so that lists of centroids, widths, and amplitudes are
    appended rather than a fixed-length flattened array (compare prep_gaussian_input_raw_old).
:param gaussians_by_cv: list of Gaussian lists at each CV
    :type gaussians_by_cv: list[list[Gaussian]]
:param selected_cvs: list of CVs to consider for unknown data
:return: 2D numpy array of formatted Gaussian information for input to standardization/classification
"""
classif_data = []
for gaussian_list in gaussians_by_cv:
# skip any non-selected CVs if requested (i.e. in unknown analysis mode)
if selected_cvs is not None:
if not gaussian_list[0].cv in selected_cvs:
continue
attribute_list = [[], [], []]
if len(gaussian_list) == 0:
continue
for gaussian in gaussian_list:
attribute_list[0].append(gaussian.centroid)
attribute_list[1].append(gaussian.width)
attribute_list[2].append(gaussian.amplitude)
classif_data.append(attribute_list)
# classif_data = np.asarray(classif_data).T
classif_data = np.asarray(classif_data)
return classif_data
def prep_gaussian_input_raw_old(gaussians_by_cv, max_num_gaussians, selected_cvs=None):
"""
Assemble a 2D numpy array of correct final dimensions from a list of Gaussians at
each collision voltage. Selected CVs can be provided (e.g. for unknown data)
    :param gaussians_by_cv: list of Gaussian lists at each CV
    :type gaussians_by_cv: list[list[Gaussian]]
:param max_num_gaussians: maximum number of Gaussians in the classifying scheme
:param selected_cvs: list of CVs to consider for unknown data
:return: 2D numpy array of formatted Gaussian information for input to standardization/classification
"""
classif_data = []
for gaussian_list in gaussians_by_cv:
# skip any non-selected CVs if requested (i.e. in unknown analysis mode)
if selected_cvs is not None:
if not gaussian_list[0].cv in selected_cvs:
continue
attributes = ['cent', 'width', 'amp']
attribute_list = np.zeros(max_num_gaussians * len(attributes))
attribute_index = 0
if len(gaussian_list) == 0:
continue
for gaussian in gaussian_list:
attribute_list[attribute_index] = gaussian.centroid
attribute_index += 1
attribute_list[attribute_index] = gaussian.width
attribute_index += 1
attribute_list[attribute_index] = gaussian.amplitude
attribute_index += 1
classif_data.append(attribute_list)
# classif_data = np.asarray(classif_data).T
classif_data = np.asarray(classif_data)
return classif_data
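# Example (illustrative values, not from the original pipeline) of the fixed-length layout built
# above: with max_num_gaussians = 3, a CV with two fitted Gaussians produces
#   [cent1, width1, amp1, cent2, width2, amp2, 0.0, 0.0, 0.0]
# so every CV row has the same length regardless of how many Gaussians were fit there.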
def standardize_all_2d(cl_inputs_by_label, params_obj):
"""
Standardization wrapper to standardize across the complete input dataset. Intended to be
called prior to any UFS or crossval. Saves standardized data into a field in the CIUAnalysisObj
contained within all classification inputs for later reference. Returns the standardization
information (mean, stdev for each input feature) to be saved into the scheme to allow for
unknown standardization. Data is saved in array by feature, then CV.
NOTE: assumes axis equalization has already been performed
    NOTE: assumes Gaussian feature prep has already been performed if using Gaussian mode
:param cl_inputs_by_label: lists of ClInput containers sorted by class label
:type cl_inputs_by_label: list[list[ClInput]]
:param params_obj: Parameters object with classification parameter information
:type params_obj: Parameters
:return: input 2D cl_input list, mean/stdev matrices used for standardization
"""
# Read input dimensions and prepare mean/stdev dataframes
example_input = cl_inputs_by_label[0][0]
example_obj = list(example_input.subclass_dict.values())[0]
subclass_labels = list(example_input.subclass_dict.keys())
cv_axis = example_obj.axes[1]
feature_axis = get_feature_axis(example_obj, params_obj.classif_1_input_mode, params_obj.classif_93_std_all_gaussians_bool)
means = {x: pandas.DataFrame(np.zeros((len(feature_axis), len(cv_axis))), index=feature_axis, columns=cv_axis) for x in subclass_labels}
stdevs = {x: pandas.DataFrame(np.zeros((len(feature_axis), len(cv_axis))), index=feature_axis, columns=cv_axis) for x in subclass_labels}
# Assemble the data across all classes to standardize by feature/cv/subclass
for cv_index, cv in enumerate(cv_axis):
for feature_index, feature in enumerate(feature_axis):
output_data_by_subcl = {x: [] for x in subclass_labels}
for class_clinput_list in cl_inputs_by_label:
for cl_input in class_clinput_list:
for subclass_label, analysis_obj in cl_input.subclass_dict.items():
if not params_obj.classif_1_input_mode == 'All_Data':
# In Gaussian mode, append data as normal
# test = analysis_obj.classif_input_raw[cv_index][feature_index]
if params_obj.classif_93_std_all_gaussians_bool:
output_data_by_subcl[subclass_label].extend(analysis_obj.classif_input_raw[cv_index][feature_index])
else:
output_data_by_subcl[subclass_label].append(analysis_obj.classif_input_raw[cv_index][feature_index])
else:
# In raw data mode, average across whole ATD rather than including each point
cv_col_matrix = np.swapaxes(analysis_obj.classif_input_raw, 0, 1)
cv_col_atd = cv_col_matrix[cv_index]
output_data_by_subcl[subclass_label].extend(cv_col_atd)
# compute mean/std
for subclass_label, data_array in output_data_by_subcl.items():
means[subclass_label].loc[feature, cv] = np.mean(data_array)