import random

# Load in the dataset being used
from dataset import *

data, labels, d2, l2 = load_custom_dataset()
# Create and return a training set and a testing set from the dataset provided.
# The test set is formed by randomly selecting 10% of the instances from the
# full dataset.
# The return is four lists: the feature vectors for all instances in the
# training set, the corresponding training labels, and the same two lists
# for the testing set.
def split_train_test(data, labels):
    # Find 10% of the total size of the dataset
    length = len(data)
    ten_percent = int(length * 0.1)
    # Randomly select the indices of 10% of the set
    indices = random.sample(range(length), ten_percent)
    test_data = []
    test_labels = []
    train_data = data.copy()
    train_labels = labels.copy()
    # For each random index, add that instance to the testing set and remove
    # it from training. Indices are sorted and removed in reverse order so
    # that the indexing is not affected by changes in size due to
    # removing items.
    for i in sorted(indices, reverse=True):
        test_data.append(data[i])
        test_labels.append(labels[i])
        del train_data[i]
        del train_labels[i]
    return train_data, train_labels, test_data, test_labels
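# A minimal usage sketch (an assumption for illustration: `data` and `labels`
# are parallel Python lists, as produced by load_custom_dataset above):
#
#   train_X, train_y, test_X, test_y = split_train_test(data, labels)
#   # The test set holds 10% of the instances (rounded down)
#   assert len(test_X) == int(len(data) * 0.1)
#   assert len(train_X) + len(test_X) == len(data)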
# Separates the training set for two-fold validation by randomly
# selecting (and removing) half of the data.
# The return is four lists: two containing the feature vectors for the
# two groups, and two containing the corresponding labels.
def two_fold_val(train_data, train_labels):
    # Find the size of the training set
    length = len(train_data)
    # Randomly separate it in half
    indices = random.sample(range(length), int(length / 2))
    data1 = []
    labels1 = []
    data2 = train_data.copy()
    labels2 = train_labels.copy()
    # Remove in reverse-sorted order so earlier deletions do not shift
    # the indices of later ones.
    for i in sorted(indices, reverse=True):
        data1.append(train_data[i])
        labels1.append(train_labels[i])
        del data2[i]
        del labels2[i]
    return data1, labels1, data2, labels2
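# Usage sketch: the two halves can swap roles as fit/validation sets. The
# `train_and_score` name below is hypothetical, standing in for whatever
# evaluation routine the caller supplies:
#
#   data1, labels1, data2, labels2 = two_fold_val(train_data, train_labels)
#   score_a = train_and_score(data1, labels1, data2, labels2)
#   score_b = train_and_score(data2, labels2, data1, labels1)
#   avg_score = (score_a + score_b) / 2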
# Creates datasets for 5-fold validation by randomly breaking the
# training set into 5 (roughly) equal-sized chunks.
# The return is two lists: one containing each of the five groups of
# data, and the other containing the corresponding labels.
def five_fold_val(data, labels):
    datasets = []
    labelsets = []
    rem_data = data.copy()
    rem_labels = labels.copy()
    length = len(data)
    # For the first 4 folds, sample 1/5 of the original dataset's size
    # from the remaining data; whatever is left after the loop forms the
    # 5th fold.
    for i in range(4):
        indices = random.sample(range(len(rem_data)), int(length / 5))
        tmpdata = []
        tmplabel = []
        # Remove in reverse-sorted order so earlier deletions do not
        # shift the indices of later ones.
        for j in sorted(indices, reverse=True):
            tmpdata.append(rem_data[j])
            tmplabel.append(rem_labels[j])
            del rem_data[j]
            del rem_labels[j]
        datasets.append(tmpdata)
        labelsets.append(tmplabel)
    datasets.append(rem_data)
    labelsets.append(rem_labels)
    return datasets, labelsets
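# Sketch of consuming the folds: hold one fold out for validation and fit
# on the concatenation of the rest (again with a hypothetical
# `train_and_score` standing in for the caller's evaluation routine):
#
#   datasets, labelsets = five_fold_val(train_data, train_labels)
#   scores = []
#   for k in range(5):
#       fit_X = [x for j in range(5) if j != k for x in datasets[j]]
#       fit_y = [y for j in range(5) if j != k for y in labelsets[j]]
#       scores.append(train_and_score(fit_X, fit_y, datasets[k], labelsets[k]))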
# Creates datasets for 10-fold validation by randomly breaking the
# training set into 10 (roughly) equal-sized sections.
# The return is two lists: one containing each of the ten groups of
# data, and the other containing the corresponding labels.
def ten_fold_val(data, labels):
    datasets = []
    labelsets = []
    rem_data = data.copy()
    rem_labels = labels.copy()
    length = len(data)
    # For the first 9 folds, repeatedly sample a number of indices equal
    # to 1/10 the size of the original dataset from the remaining data;
    # removing these leaves the remaining data at the correct size for
    # the 10th fold after all iterations.
    for i in range(9):
        indices = random.sample(range(len(rem_data)), int(length / 10))
        tmpdata = []
        tmplabel = []
        for j in sorted(indices, reverse=True):
            tmpdata.append(rem_data[j])
            tmplabel.append(rem_labels[j])
            del rem_data[j]
            del rem_labels[j]
        datasets.append(tmpdata)
        labelsets.append(tmplabel)
    datasets.append(rem_data)
    labelsets.append(rem_labels)
    return datasets, labelsets
# Creates datasets for leave-one-out cross validation.
# Goes through all possible iterations of leaving one instance out of the
# training set.
# Returns 4 lists: two containing the feature vectors of the validation and
# training data, and two containing the labels. The lists are organized so that
# training_data[0] will have the feature vectors for all instances except
# the instance at validation_data[0], and so on.
def LOO_val(data, labels):
    validation_data = []
    validation_labels = []
    training_data = []
    training_labels = []
    for i in range(len(data)):
        # Copy the full set, pull out instance i for validation, and keep
        # the rest as that iteration's training set.
        tmp_data = data.copy()
        tmp_labels = labels.copy()
        validation_data.append(tmp_data[i])
        validation_labels.append(tmp_labels[i])
        del tmp_data[i]
        del tmp_labels[i]
        training_data.append(tmp_data)
        training_labels.append(tmp_labels)
    return training_data, training_labels, validation_data, validation_labels
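# A small sanity check, guarded so it only runs when this module is executed
# directly (not on import). It exercises each splitter on the dataset loaded
# above, assuming `data` and `labels` are parallel lists; the printed sizes
# follow from the splitting logic.
if __name__ == "__main__":
    train_X, train_y, test_X, test_y = split_train_test(data, labels)
    print("train/test sizes:", len(train_X), len(test_X))

    folds, fold_labels = ten_fold_val(train_X, train_y)
    print("10-fold sizes:", [len(f) for f in folds])

    loo_train, loo_train_y, loo_val, loo_val_y = LOO_val(train_X, train_y)
    print("LOO iterations:", len(loo_val))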