-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_import.py
120 lines (96 loc) · 5.22 KB
/
data_import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import numpy as np
from pandas import DataFrame
def data_load(
file: list,
sheet: list,
index_col: int,
header: list
) -> DataFrame:
# load data
df = pd.read_excel(file, sheet_name=sheet, header=header, index_col=index_col)
# remove all df unnamed elements in multiindex
for i, columns_old in enumerate(df.columns.levels):
columns_new = np.where(columns_old.str.contains('Unnamed'), '', columns_old)
df.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
# join columns multi levels
df.columns = df.columns.map('|'.join).str.strip('|')
# reset column multi levels
# columns_new=[]
# for column in df.columns:
# for title in reversed(column):
# if title != '':
# columns_new.append(title)
# break
# df.columns = columns_new
# # for duplicated columns
# s = df.columns.to_series()
# df.columns = s.add(s.groupby(s).cumcount().astype(str).replace('0', ''))
return df
if __name__ == '__main__':
    # Data-preparation pipeline: load one sheet of a clinical dataset,
    # binarize string categories, drop imbalanced/correlated/unwanted
    # columns, impute missing floats, and export the prepared table.

    # properties of the loaded data table
    excel_file = 'dataset/dataset.xlsx'
    excel_sheet = 'Sum'
    index_col = 0  # first column is the row index
    header = [0] # [0, 1, 2]
    # data loading
    # df = data_load(excel_file, excel_sheet, index_col, header)
    df = pd.read_excel(excel_file, sheet_name='Sum', header=header, index_col=index_col)
    # remember the full original column list so surviving columns can be
    # reported at the end of the pipeline
    data_list = pd.Series(df.columns)
    # replace string values for num 1|0
    # NOTE(review): Russian labels — 'есть'/'нет' = present/absent,
    # 'муж'/'жен' = male/female; 'АКШ'/'БиМКШ' appear to be surgery
    # types — confirm with the dataset owner.
    df = df.replace('есть', 1)
    df = df.replace('нет', 0)
    df = df.replace('муж', 1)
    df = df.replace('жен', 0)
    df = df.replace('АКШ', 1)
    df = df.replace('БиМКШ', 0)
    # Change objects and specific data into categorical data
    # spec_categ_cols = ['Пред. |Оценка|ФК ХСН', 'Пред. |ЭХО-КГ до|ФВ ЛЖ'] # list of extra categorical cols
    categorical_cols = [col for col in df.columns if df[col].dtype == "object"]
    # NOTE(review): object_nunique is computed but only the heading is
    # printed — the Series print below is commented out.
    object_nunique = list(map(lambda col: df[col].nunique(), categorical_cols))
    print('The number of unique entries in each column with categorical data:')
    # print(pd.Series(dict(zip(categorical_cols, object_nunique))),'\n')
    # categorical_cols.extend(spec_categ_cols)
    # df_categ_labeled = pd.get_dummies(df[categorical_cols]) # encoding categorical columns
    # df = pd.concat([df, df_categ_labeled], ignore_index=False, axis=1)
    # df = df.drop(categorical_cols, axis=1)
    # find binary data
    binary_cols = df[df.columns].isin([0, np.nan, 1]).all() # find NaN|1|0 cols in all data
    binary_cols_lst = list(binary_cols[binary_cols == True].index) # list of NaN|1|0 cols
    # replace NaN to 0 in binary data (missing is treated as "absent")
    df[binary_cols_lst] = df[binary_cols_lst].fillna(0)
    # data type transformation
    df[binary_cols_lst] = df[binary_cols_lst].astype(np.uint8) # uint8 for 1|0 cols
    int64_cols_lst = df.dtypes[df.dtypes == 'int64'].index # list of int64 cols
    df[int64_cols_lst] = df[int64_cols_lst].astype(np.float64) # int64 -> float64
    print('Checking types of data:')
    print(pd.Series(df.dtypes).groupby(df.dtypes).count())
    # drop imbalanced data outside 10-90%
    # (share of 1s per binary column; columns where that share is below
    # 10% or above 90% carry little signal and are dropped)
    binary_disbalace = df[binary_cols_lst]
    binary_disbalace = binary_disbalace[binary_disbalace == 1].count() \
        / len(binary_disbalace)
    drop_cals = binary_disbalace[(binary_disbalace < 0.10) |
                                 (binary_disbalace > 0.90)].index # drop data outside 10-90%
    drop_cals = list(drop_cals)
    # drop_cals.remove('Пред. |Общ.|Пол') # save a specific element
    # drop_cals.remove('Пред. |ФР ССЗ|АГ') # save a specific element
    df.drop(drop_cals, axis=1, inplace=True)
    # drop correlated predictors with float type
    # NOTE(review): the correlation justifying these three drops is not
    # computed here — presumably found in a separate analysis; confirm.
    df.drop(['Интраоп Время пережатия аорты',
             'Интраоп T-тела',
             'Интраоп Количество КП'], axis=1, inplace=True) # drop correlated columns
    # drop custom data (all columns whose name starts with 'Отдал')
    del_cols = df.columns[df.columns.str.contains('^Отдал.*') == True]
    df.drop(del_cols, axis=1, inplace=True) # drop custom columns
    # check the remained data: mark which original columns survived the
    # pipeline and export the marker table
    data_list = pd.DataFrame(data_list, index = data_list.values)
    remained_data = pd.Series('yes', index=[col for col in list(df.columns)
                                            if col in list(data_list.index)])
    data_list.rename(columns={data_list.columns[0]: 'remained'}, inplace=True)
    data_list['remained'] = remained_data
    data_list.to_excel("dataset/remained_data_list.xlsx") # export list of prepared data
    # fill NA/NaN values in float data with the per-column mean
    float_cols_lst = list(df.dtypes[df.dtypes == 'float64'].index)
    df[float_cols_lst] = df[float_cols_lst].fillna(df[float_cols_lst].mean(), axis=0)
    # save prepared data to excel
    df.to_excel("dataset/prepared_data.xlsx", sheet_name='prepared_data')