-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions (1).py
215 lines (182 loc) · 11.3 KB
/
functions (1).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
def compare_columns(df, x, color):
"""Takes in a dataframe and a histogram of column with color grouping """
fig = px.histogram(df, x=x, color=color)
fig.update_layout(barmode='group')
fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
fig.show()
def plot_feature_importances(model):
"""Takes in a model and plots feature importance"""
n_features = data_train.shape[1]
plt.figure(figsize=(8,8))
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), data_train.columns.values)
plt.xlabel('Feature importance')
plt.ylabel('Feature')
def base_model(target, predictors):
("""Input: target, predictors
Runs through a basic 'regular' tree model for baseline
Output: feature importance plot, confustion matrix, classification report""")
data = pd.get_dummies(predictors)
data_train, data_test, target_train, target_test = train_test_split(data, target,test_size = 0.25, random_state=123)
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=5)
tree_clf.fit(data_train, target_train)
tree_clf.feature_importances_
pred = tree_clf.predict(data_test)
print(confusion_matrix(target_test, pred))
print(classification_report(target_test, pred))
print("Testing Accuracy for Decision Tree Classifier: {:.4}%".format(accuracy_score(target_test, pred) * 100))
def column_inspect(df, colname):
"""Takes in a dataframe and column name and return meta data a histogram of column"""
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
print("Number of NaNs:", df[[colname]].isna().sum())
print("Unique Values:", df[colname].unique())
print("Number of Unique Values:", df[colname].nunique())
fig = px.histogram(df, x=colname)
fig.update_layout(barmode='group')
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
fig.update_layout(title_text=colname)
fig.show()
fig = px.histogram(df, x=colname, color='status_group')
fig.update_layout(barmode='group')
fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
fig.show()
### Train Test Split
def val_group(df):
"""This function takes in the Tanzania waterpump dataframe and set each column to their nessesary dtype"""
target = df['status_group']
features = df.drop('status_group', axis=1)
data_train, data_test, target_train, target_test = train_test_split(target, features, test_size = 0.25, random_state=42)
return data_train, data_test, target_train, target_test
### Set Dtypese ####
def set_dtypes(df):
"""This function takes in the Tanzania waterpump dataframe and set each column to their nessesary dtype"""
#int
df[['id', 'construction_year']] = df[['id', 'construction_year']].astype(int)
#float
df[['amount_tsh', 'gps_height', 'longitude',
'latitude', 'num_private', 'population']] = df[['amount_tsh', 'gps_height', 'longitude', 'latitude',
'num_private', 'population']].astype(float)
#string
df[['date_recorded', 'funder', 'installer','wpt_name', 'basin', 'subvillage', 'region', 'region_code',
'district_code', 'lga', 'ward', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name',
'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management',
'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity',
'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group',
'status_group']] = df[['date_recorded', 'funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region',
'region_code','district_code', 'lga', 'ward', 'public_meeting', 'recorded_by',
'scheme_management', 'scheme_name', 'permit', 'extraction_type',
'extraction_type_group', 'extraction_type_class', 'management', 'management_group',
'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity',
'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type',
'waterpoint_type_group', 'status_group' ]].astype(str)
return df
### Drop columns ###
def drop_columns(df):
"""This function takes in the Tanzania waterpump dataframe and drops unneeded columns"""
#drop columns
new_df = df.drop(['wpt_name', 'installer', 'id', 'date_recorded', 'recorded_by','payment_type', 'quality_group',
'quantity_group', 'source', 'source_class', 'waterpoint_type','scheme_name', 'extraction_type_group',
'extraction_type_class', 'management_group','num_private'], axis=1)
return new_df
###Transform Categorical Columns###
def transform_dataframe_categorical(df):
"""This function takes in the Tanzania waterpump dataframe and returns the processes dataframe needed
for modeling"""
#fill nan_values
df.fillna('unknown', inplace=True)
#classify know and unknown years
df['construction_year'].mask(df['construction_year'] != 0, 'known', inplace=True)
df['construction_year'].mask(df['construction_year'] == 0, 'unknown', inplace=True)
#Groups scheme by type
df['scheme_management'].mask(df['scheme_management'] == 'Other', 'other', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Unkown', 'other', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'VWC', 'private', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Private operator', 'private', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Private', 'private', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Biore', 'private', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Company', 'private', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Parastatal', 'private', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Trust', 'private', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'WUG', 'government', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Water Board', 'government', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'WUA', 'governement', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'Water authority', 'government', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'SWC', 'government', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'None', 'other', inplace=True)
df['scheme_management'].mask(df['scheme_management'] == 'unknown', 'other', inplace=True)
return df
###Bin Categorical Columns###
def map_binarizer(df):
Categorical = ['status_group', 'funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region',
'region_code', 'district_code', 'lga', 'ward', 'public_meeting', 'scheme_management',
'scheme_name', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group',
'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type',
'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
'source_class', 'waterpoint_type', 'waterpoint_type_group', 'public_meeting',
'scheme_management', 'scheme_name', 'permit', 'extraction_type', 'extraction_type_group',
'extraction_type_class', 'management', 'management_group', 'construction_year']
for col in df:
if col in Categorical:
my_mapper = DataFrameMapper([('Title', sklearn.preprocessing.LabelBinarizer())], input_df=True)
return my_mapper
### Transform Continous Columns###
def transform_dataframe_continous(df):
"""This function takes in the Tanzania waterpump dataframe and returns the processes dataframe needed for modeling"""
import numpy as np
#fill nan_values
df['gps_height'] == df['gps_height'].replace(0, np.nan)
df['population'] == df['population'].replace(0, np.nan)
df['longitude'] == df['longitude'].replace(0., np.nan)
df['latitude'] == df['latitude'].replace(-0.06, np.nan)
df['gps_height'] == df['gps_height'].replace(0, np.nan)
#Takes the average of the watersource type to normalize average_tsh
df.groupby(['source_type', 'basin', 'waterpoint_type_group'])['amount_tsh'].transform('mean')
#take the average population of other pumps in the area
df['population'].fillna(df.groupby(['region', 'lga', 'ward', 'subvillage'])['population'].transform('mean'))
#take the average longitude of other pumps in the area
df['longitude'].fillna(df.groupby(['region', 'lga', 'ward', 'subvillage'])['longitude'].transform('mean'))
#take the average latitude of other pumps in the area
df['latitude'].fillna(df.groupby(['region', 'lga', 'ward', 'subvillage'])['latitude'].transform('mean'))
#replace gps_height 0s with average
df['gps_height'].fillna(df.groupby(['region', 'lga', 'ward', 'subvillage'])['gps_height'].transform('mean'))
return df
def proccess_data(df):
###This function takes in a dataframe and passes them through the processing functions it returns a dataframe###
set_dtypes(df)
transform_dataframe(df)
return df
def base_model(df, y, X):
("""Input: target, predictors
Runs through a basic 'regular' tree model for baseline
Output: feature importance plot, confustion matrix, classification report""")
#data = pd.get_dummies(predictors)
data_train, data_test, target_train, target_test = train_test_split(X, y, target,test_size = 0.25, random_state=123)
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=5)
tree_clf.fit(data_train, target_train)
pred = tree_clf.predict(data_test)
print(tree_clf.feature_importances_ )
print(confusion_matrix(target_test, pred))
print(classification_report(target_test, pred))
print("Testing Accuracy for Decision Tree Classifier: {:.4}%".format(accuracy_score(target_test, pred) * 100))
def p_x_given_class(obs_row, feature, class_):
"""calculates point estimate"""
mu = aggs[feature]['mean'][class_]
std = aggs[feature]['std'][class_]
# Observation
obs = obs_row[feature]
p_x_given_y = stats.norm.pdf(obs, loc=mu, scale=std)
return p_x_given_y
def predict_class(obs_row):
"""predicts the probability of a certain frequency occuring"""
c_probs = []
for c in range(2):
# Initialize probability to relative probability of class
p = len(y_train[y_train == c])/len(y_train)
for feature in X.columns:
p *= p_x_given_class(obs_row, feature, c)
c_probs.append(p)
return np.argmax(c_probs)