# Script that builds the ML model for blight-ticket compliance
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer  # replaces the deprecated sklearn.preprocessing.Imputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import GradientBoostingClassifier

def blight_model():
    # Init the data frames :)
    dfTrain = pd.read_csv('dataframes/train.csv', engine='python')
    dfTest = pd.read_csv('dataframes/test.csv', engine='python')
    dfAddresses = pd.read_csv('dataframes/addresses.csv', engine='python')
    dfGeo = pd.read_csv('dataframes/latlons.csv', engine='python')
    # Clean the data
    dfTrain2 = dfTrain.copy(deep=True)
    # Drop rows where the target column is NaN
    dfTrain2 = dfTrain2[pd.notnull(dfTrain2['compliance'])]
    # Print the number of NaNs per column:
    # dfTrain2.isnull().sum()
    # Drop columns that are entirely, or almost entirely, NaN
    dfTrain2 = dfTrain2.drop(['violation_zip_code', 'grafitti_status', 'non_us_str_code'], axis=1)
    # Drop leaked data: these columns are only filled in after the outcome is known
    # (google "data leakage machine learning" for more information)
    dfTrain2 = dfTrain2.drop(['compliance_detail', 'collection_status', 'payment_amount',
                              'payment_date', 'payment_status', 'balance_due'], axis=1)
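    # One way to see the leak, illustrative only (not executed here):
    # dfTrain.groupby('payment_status')['compliance'].mean()
    # would show how columns filled in after the outcome track the target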
    # Add the geo coordinates (lat/lon per address) from the geo data frame to the main data frame
    newdf = dfAddresses.merge(dfGeo, on=['address'])
    dfTrain2 = dfTrain2.merge(newdf, on=['ticket_id'])
    # Initialize the testing data frame with the same geo columns
    dfTest2 = dfTest.merge(newdf, on=['ticket_id'])
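    # A quick sanity check one could run here (illustrative only):
    # print(len(dfTrain2), dfTrain2['lat'].isnull().sum())  # rows kept by the merge, tickets that failed to geocode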
    # We explore the remaining features:
    # for i in range(len(dfTrain2.columns)):
    #     print(dfTrain2.groupby(dfTrain2.columns[i]).size())
    #     print('')
    # Further below there are plots and group-bys that I used to get a deeper understanding
    # a) admin_fee and state_fee are constant in 100% of the rows, country is mainly USA, clean_up_cost is 0
    # b) late_fee is a fee charged when the ticket is not paid on time; however, it is set at the moment the
    #    sanction is issued, so it is not a data leak
    # c) hearing_date can introduce a data leak, as it is only set when the ticket was not paid on time
    # d) judgment_amount includes all the fees; it diverges from fine_amount and can leak information from late_fee
    # e) ticket_id is just the ID of each row and is not meaningful; ticket_issued_date has no impact on the violator
    # f) inspector_name is not a variable that influences the violator; the violator is independent of it
    # g) agency_name: same reasoning as inspector_name
    # h) violator_name appears several times across instances, often misspelled; the name should not be relevant,
    #    so we drop it
    # i) Each violator_name becomes a new category, which is only relevant if the same violator name (the same
    #    category) is already in the system. So if Max is in the system 10 times and hasn't paid anything, the
    #    prediction on her 11th instance is that she won't pay. This is useless here, as the sketch below notes.
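    # (If a repeat-offender signal were wanted anyway, a sketch would be:
    # dfTrain2['prior_tickets'] = dfTrain2.groupby('violator_name').cumcount()
    # but per the reasoning above it is not used here)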
    dfTrain2 = dfTrain2.drop(['admin_fee', 'state_fee', 'country', 'ticket_issued_date',
                              'hearing_date', 'ticket_id', 'clean_up_cost', 'judgment_amount',
                              'inspector_name', 'violator_name', 'agency_name'], axis=1)
    # We continue to explore the features with histograms
    dfTrainP = dfTrain2.loc[dfTrain2['compliance'] == 1.0].copy()
    dfTrainN = dfTrain2.loc[dfTrain2['compliance'] == 0.0].copy()
    # List of feature names, used as a filter
    features = ['zip_code', 'violation_code', 'fine_amount', 'discount_amount',
                'lat', 'lon', 'disposition', 'late_fee']
    # Obtain the features from the clean training frame in a new copy
    X_traina = dfTrain2.filter(features).copy(deep=True)
    # Obtain the label data as a 1-D Series (avoids a DataConversionWarning when fitting)
    y = dfTrain2['compliance'].copy()
    # Keep the submission ticket ids before filtering the feature columns, since the
    # address merge above may have dropped or reordered rows relative to dfTest
    testIds = dfTest2['ticket_id']
    # Obtain the same features for dfTest2
    dfTest2 = dfTest2.filter(features)
    # Perform deep copies so the originals stay traceable
    X_train2 = X_traina.copy(deep=True)
    X_testSubmit = dfTest2.copy(deep=True)
    # Obtain the dtype of each column (feature)
    Xdata = X_traina.dtypes
    # Encode the 'object' (string) features as categorical integer codes
    for column in Xdata.index[Xdata == 'O']:
        # Fit a single encoder on the union of train and test values so both
        # frames share one consistent string -> integer mapping (fitting two
        # separate encoders would assign different codes to the same category)
        le = LabelEncoder().fit(pd.concat([X_train2[column], X_testSubmit[column]]).astype(str))
        X_train2[column] = le.transform(X_train2[column].astype(str))
        X_testSubmit[column] = le.transform(X_testSubmit[column].astype(str))
        # The resulting codes are nominal: a category encoded as 3 is not
        # "bigger" than one encoded as 2, it is just the instance named 3
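    # If a true unordered categorical dtype were wanted instead of plain integer
    # codes, a sketch would be (not used here, since the tree-based model below
    # handles integer-coded categories reasonably well):
    # X_train2[column] = X_train2[column].astype('category').cat.codes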
    # Impute the column mean into the remaining NaNs; fit the imputer on the
    # training data only and reuse that same fit for the submission set
    my_imputer = SimpleImputer()
    X_train2 = my_imputer.fit_transform(X_train2)
    X_testSubmit = my_imputer.transform(X_testSubmit)
    # Get the scaler and split the data
    scaler = MinMaxScaler()
    X_train, X_test, y_train, y_test = train_test_split(X_train2, y, random_state=0)
    # Fit the scaler on the training split only, then reuse it on the held-out split
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Try the ML algorithms (imports for the commented-out classifiers are omitted)
    # kNN (not the best approach for this data set):
    # knn = KNeighborsClassifier(n_neighbors=75).fit(X_train_scaled, y_train)
    # Tree classifiers:
    # clf = DecisionTreeClassifier(random_state=0).fit(X_train_scaled, y_train)
    # clf = RandomForestClassifier(random_state=0).fit(X_train_scaled, y_train)
    # Gradient-boosted trees are the ones that performed best
    clf = GradientBoostingClassifier(random_state=0, n_estimators=50, learning_rate=0.5).fit(X_train_scaled, y_train)
    y_scores = clf.predict_proba(X_test_scaled)
    fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
    roc_auc = auc(fpr, tpr)
    # For this set we obtained: 0.814747374362
    print("The AUC for this set is:", roc_auc)
    # Now we predict on the submission test set
    X_test_scaled_submit = scaler.transform(X_testSubmit)
    y_scores_Submit = clf.predict_proba(X_test_scaled_submit)
    # Build a Series named "compliance": the index is the ticket id and the value
    # is the predicted probability that the ticket is paid on time
    dfaux = pd.DataFrame(y_scores_Submit)
    # Use the ids captured before filtering, so rows stay aligned even if the
    # address merge dropped or reordered tickets
    dfaux['ticket_id'] = testIds.values
    s = pd.Series(dfaux[1].values, name='compliance',
                  index=pd.Index(dfaux['ticket_id'], name='ticket_id'))
    return s
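

# A minimal usage sketch, assuming the dataframes/ CSV files are present next
# to this script (the output path 'predictions.csv' is illustrative only):
if __name__ == '__main__':
    predictions = blight_model()
    print(predictions.head())
    # predictions.to_csv('predictions.csv', header=True)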