-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preprocess.py
89 lines (64 loc) · 2.4 KB
/
data_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
def substrings_in_string(big_string, substrings):
for substring in substrings:
if big_string.find(substring) != -1:
return substring
return np.nan
def change_labels(df, column):
for c in column:
le = LabelEncoder()
le.fit(df[c])
df[c] = le.transform(df[c])
return df
def handle_age(row, title_age_mean):
if row['Age'] == 0:
row['Age'] = title_age_mean[row['Title']]
return row
# def change(df):
def process(df):
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
'Don', 'Jonkheer']
df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
title=x['Title']
if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
return 'Mr'
elif title in ['Countess', 'Mme']:
return 'Mrs'
elif title in ['Mlle', 'Ms']:
return 'Miss'
elif title =='Dr':
if x['Sex']=='Male':
return 'Mr'
else:
return 'Mrs'
else:
return title
df['Title']=df.apply(replace_titles, axis=1)
#Creating new family_size column
df['Family_Size']=df['SibSp']+df['Parch']
df['Age*Class']=df['Age']*df['Pclass']
df['Fare_Per_Person']=df['Fare']/(df['Family_Size']+1)
df.fillna({'Cabin':'Unknown'}, inplace = True)
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
df['Deck']=df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
df.drop(['Cabin', 'Name', 'Ticket'], axis = 1, inplace= True)
# print(df.columns)
# print(df.shape)
df.fillna({'Embarked': 'S'}, inplace = True)
df = change_labels(df, ['Sex', 'Embarked', 'Title','Deck'])
# print(df)
# df = df.apply(LabelEncoder().fit_transform)
df.fillna({'Age': 0},inplace=True)
unique_titles = df['Title'].unique()
title_age_mean = {}
for title in unique_titles:
title_age_mean[title] = np.mean(df[df['Title'].isin([title])]['Age'])
# print(title_age_mean)
df = df.apply(lambda x: handle_age(x, title_age_mean), axis = 1)
df.to_csv('./temp.csv')
return df