-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
102 lines (86 loc) · 3.28 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import math
from sklearn.model_selection import train_test_split
from utils.feature_select import *
from utils.ml_utils import train_all, train_mean
import numpy
import pandas as pd
#
# TODO: clean the surgical-history and medical-history data
def count_label(read_file):
    """
    Print, for every column, how often each categorical value occurs.

    Numeric cells and strings that begin with an ASCII digit are treated as
    measurements and are left out of the tally. Missing cells are float NaN,
    so they are skipped together with the other numbers.

    :param read_file: DataFrame-like object, e.g. the result of ``pd.read_csv``
    :return: None; each column label and its ``{value: count}`` dict are printed
    """
    for label in read_file.columns.values.tolist():
        count_map = {}
        for val in read_file[label]:
            # Numbers (NaN included) are not categories. The isinstance test
            # alone covers NaN: an equality test against NaN is always False,
            # so it cannot be used to detect missing values.
            if isinstance(val, (int, float)):
                continue
            # Slicing with [:1] keeps empty strings from raising IndexError;
            # an empty string simply fails the digit-range comparison.
            if isinstance(val, str) and '0' <= val[:1] <= '9':
                continue
            count_map[val] = count_map.get(val, 0) + 1
        print(label)
        print(count_map)
def remove_units(read_file):
    """
    Strip measurement units from digit-leading string cells, column by column.

    For each column a replacement list is built:
      * int/float cells are kept, except NaN which becomes 0;
      * strings starting with an ASCII digit are reduced to their digits and
        dots and converted with ``float`` (e.g. ``'65.5kg'`` -> ``65.5``);
      * every other cell contributes nothing.
    A column is overwritten only when every one of its cells produced a
    value. Columns that produced no values are left alone silently; columns
    that produced only some values are reported and left unchanged.

    :param read_file: DataFrame-like object, e.g. the result of ``pd.read_csv``;
                      it is modified in place
    :return: the same ``read_file`` object after removing units
    :raises ValueError: if the digits and dots kept from a string do not form
                        a valid float (e.g. ``'1.2.3'``)
    """
    expected_len = len(read_file)
    for label in read_file.columns.values.tolist():
        val_list = []
        for val in read_file[label]:
            if isinstance(val, (int, float)):
                # Fill missing numeric values with 0.
                val_list.append(0 if math.isnan(val) else val)
            elif isinstance(val, str) and '0' <= val[:1] <= '9':
                # [:1] instead of [0] so an empty string is skipped rather
                # than raising IndexError.
                kept = [ch for ch in val if '0' <= ch <= '9' or ch == '.']
                val_list.append(float(''.join(kept)))

        if not val_list:
            # Nothing numeric in this column; leave it as it is.
            continue
        if len(val_list) != expected_len:
            # Some cells could not be converted. Assigning a list of the
            # wrong length would raise, so report and keep the column.
            print(label)
            print(len(val_list))
            print('error!')
            continue
        read_file[label] = val_list
    return read_file
# Set to True to run feature_analysis on the raw data and pause before training.
analyse_feature = False

read_file = pd.read_csv('data/train.csv', low_memory=False)
if analyse_feature:
    feature_analysis('age', read_file)
    input()  # wait for Enter before continuing with the pipeline

# process_data / find_top_feature are not defined in this file; presumably
# they come from the `utils.feature_select` star import -- TODO confirm.
read_file = process_data(read_file)
selected_features = find_top_feature(read_file)
selected_features.append('is_BPH')  # keep the target column next to the features
read_file = read_file[selected_features]
print(read_file.columns)

features = read_file
# Labels are the values we want to predict.
# `numpy` is imported explicitly by this file; the short alias `np` is not,
# so the explicit module name is used here.
labels = numpy.array(features['is_BPH'])
# Remove the labels from the features
features = features.drop('is_BPH', axis=1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = numpy.array(features)

# Split the data into training, validation and testing sets:
# 40% is held out first, then halved, giving a 60/20/20 split.
train_features, X_temp, train_labels, y_temp = train_test_split(
    features, labels, test_size=0.4, random_state=42)
val_features, test_features, val_labels, test_labels = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)
# train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25,
# random_state=42)

# train_all receives the training split plus the split to evaluate on
# (its behavior lives in utils.ml_utils and is not visible here).
print('-'*20+'validation'+'-'*20)
# train_mean(train_features, train_labels, val_features, val_labels)
train_all(train_features, train_labels, val_features, val_labels)
print('-'*20+'test'+'-'*20)
# train_mean(train_features, train_labels, test_features, test_labels)
train_all(train_features, train_labels, test_features, test_labels)