-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathloan-approval-prediction.py
461 lines (346 loc) · 16.7 KB
/
loan-approval-prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
# -*- coding: utf-8 -*-
"""Thesis.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1nv8Q5ScnrXJLw_GV0uaUjn8HFZ2krYKe
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_fscore_support
#ML models
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
# Load the loan data set. The path points at a Google Drive mount, as used
# when the notebook ran in Colab.
df = pd.read_csv('/content/drive/MyDrive/loan_data_set.csv')
import re  # NOTE(review): not used by any active line of this script

# Normalise the column headers: lower case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Check the structure of the data (dtype of every column).
print(df.dtypes)
# Check the first five rows of the data.
print(df.head())
# Show the normalised column names.
print(df.columns)
# Summary statistics. A bare `df.describe()` only renders as notebook cell
# output; print it so the table is also shown when this runs as a script.
print(df.describe())
# Exploratory Data Analysis

# Distribution of the loan amounts.
n, bins, patches = plt.hist(x=df['loanamount'], bins='auto', color='blue',
                            alpha=0.7, rwidth=0.85)
plt.xlabel("Loan Amount")
plt.show()

# Number of rows per loan_status value (the two classes that are modelled
# later). Printed explicitly: a bare expression shows nothing in a script.
print(df['loan_status'].value_counts())

# Distribution of loan status across property area.
# Open question from the original notes: does Semi Urban have the highest
# number of defaults?
print(pd.crosstab(df['loan_status'], df['property_area']))

# Applicant income by property area.
df.boxplot(column=['applicantincome'], by='property_area')
plt.title("Distribution of Applicant's income by Property Area")
plt.suptitle('')  # drop the automatic "Boxplot grouped by ..." super-title
# Show the figure now; otherwise the next pyplot call draws onto these axes.
plt.show()

# Number of missing values per column.
print(df.isnull().sum())
# Feature engineering: combined income of applicant and co-applicant.
df['total_income'] = df['applicantincome'] + df['coapplicantincome']
print(df.head())

# Scatter plot of loan amount against total income. Both axes show log10 of
# the raw value, so the labels say so.
plt.figure()  # fresh figure so the points are not drawn onto an earlier plot
plt.scatter(np.log10(df['total_income']), np.log10(df['loanamount']),
            color='blue', alpha=0.5)
plt.xlabel('Total Income (log10)')
plt.ylabel('Loan Amount (log10)')
plt.show()
# Data visualisation: three bar charts, each shown as its own figure.
bar_colors = ['blue', 'cyan']

# 1) Loan amount against credit history (one bar drawn per row of df).
plt.bar(df['credit_history'], df['loanamount'], color='cyan', alpha=0.5)
plt.xticks([0, 1])
plt.xlabel('Credit History')
plt.ylabel('Loan Amount')
plt.show()
# Author's reading of the chart: applicants with a credit history access
# higher loan amounts.

# 2) Mean total income for every gender / loan status combination.
income_by_gender = pd.crosstab(
    df['gender'],
    df['loan_status'],
    values=df['total_income'],
    aggfunc='mean',
)
income_by_gender.plot.bar(figsize=(6, 6), color=bar_colors)
plt.legend(['No', 'Yes'])
plt.title('Bar plot of Gender by Loan Status')
plt.ylabel('Average Income')
plt.show()

# 3) Number of loans per property area, split by loan status.
status_by_area = pd.crosstab(df['property_area'], df['loan_status'])
status_by_area.plot.bar(figsize=(6, 6), color=bar_colors)
plt.title('Property area vs Loan status')
plt.xlabel('Property Area')
plt.ylabel('count')
plt.xticks(rotation=0)
plt.show()
# Author's reading of the chart: Semi urban has the highest default, possibly
# because it has the highest debt to income ratio.
# Pair plot preparation.
# Map 'Y'/'N' to 1/0 and the labels '1', '2', '3+' to integers so the coded
# columns can be plotted. The mapping is applied to every cell of the frame,
# so any column holding exactly these strings is converted.
dep_dict = {'Y': 1, 'N': 0, '1': 1, '2': 2, '3+': 3}
# DataFrame.applymap is deprecated in recent pandas in favour of
# DataFrame.map; use whichever one this pandas version provides.
_elementwise = df.map if hasattr(df, 'map') else df.applymap
df = _elementwise(lambda x: dep_dict.get(x, x))

# Coerce the dependents column to a numeric dtype; values that cannot be
# parsed become NaN.
df['dependents'] = pd.to_numeric(df['dependents'], errors='coerce')

# Pair plot of the variables listed below.
# NOTE(review): self_employed is only numeric here if its raw values are
# exactly 'Y'/'N' - confirm against the data set.
sns.pairplot(df[['loanamount', 'self_employed', 'applicantincome',
                 'coapplicantincome', 'loan_amount_term', 'credit_history',
                 'dependents', 'loan_status']])
# Show the pair plot before starting the next figure. The original called
# plt.clf() at this point, which cleared the pair plot before it was shown.
plt.show()

# Box plot of loan amount by loan status.
df.boxplot(column=['loanamount'], by='loan_status')
plt.title('Loan Amount by Loan Status')
plt.suptitle('')
plt.show()
# Observation recorded in the original notes: no significant difference
# between the two loan status groups.

# Frequency of graduate and non-graduate applicants.
# Original observation: most of the applicants are graduates.
print(df['education'].value_counts())

# Number of male and female applicants.
# Original observation: there are more male than female applicants.
print(df['gender'].value_counts())
# Replace missing values: mode for the discrete columns, median for the loan
# amount.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df[col]`. That chained in-place form is deprecated
# in pandas 2.x and leaves `df` unchanged once copy-on-write is enabled.
for column in ('dependents', 'self_employed', 'loan_amount_term',
               'credit_history'):
    df[column] = df[column].fillna(df[column].mode()[0])
df['loanamount'] = df['loanamount'].fillna(df['loanamount'].median())

# Report which columns still contain missing values.
print(df.isnull().any())

# One-hot encode the property area and education columns so the machine
# learning algorithms can consume them.
df = pd.get_dummies(df, columns=['property_area', 'education'])
print(df)
# Correlation map: strength of the pairwise relationships among variables.
# Observations recorded in the original notes:
#   - applicant income and loan amount: fair positive correlation (0.62)
#   - loan status and credit history: fair positive correlation (0.54)
#   - total income and education (graduate): fair positive correlation
#
# Only numeric and boolean columns are correlated. Text columns are still in
# the frame, and DataFrame.corr() raises on them from pandas 2.0 onwards
# instead of silently dropping them.
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, cmap='coolwarm')
# Show the heat map now so later pyplot calls do not draw on top of it.
plt.show()
# ML models: build the feature matrix and target, then split train / test.
# Dropped columns: the identifier, the target itself, three categorical
# columns that are not used as predictors, and the two raw income columns
# (their sum is kept as total_income).
X = df.drop(['loan_id', 'gender', 'married', 'self_employed', 'loan_status',
             'applicantincome', 'coapplicantincome'], axis=1)
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42)

# Imbalanced data: the original notes count 422 rows in the larger class.
# Random undersampling and RandomOverSampler were tried earlier; SMOTE
# oversampling is the approach in use.
#
# The resampler must only see the training split. Fitting it on the full
# X, y (as before) puts the test rows, and synthetic points interpolated
# from them, into the training set, which leaks information and inflates
# every test metric computed afterwards.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Logistic regression: standardise the features, then fit the classifier.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Fitting the pipeline fits the scaler and the model on the training data.
# (The pasted `Pipeline(steps=[...])` notebook output that followed this call
# only built and discarded two unused objects, so it has been removed.)
pipe.fit(X_train_resampled, y_train_resampled)

# Predict on the test set. The pipeline applies the scaler fitted above, so
# the test data does not influence the scaling parameters.
y_pred = pipe.predict(X_test)

# Coefficients of the fitted logistic regression step.
coefficients = pipe.named_steps['logisticregression'].coef_

# Classification report.
# The original notebook run reported a recall of 0.84 for the 'Default' class.
# NOTE(review): class 1 is whatever was coded as 'Y' in loan_status - confirm
# that 'Y' really denotes a default before relying on these display names.
target_names = ['Non-Default', 'Default']
print(classification_report(y_test, y_pred, target_names=target_names))
# ROC chart for the logistic regression pipeline.

# Predicted class probabilities for the test set; the second column is used
# as the probability of default.
preds = pipe.predict_proba(X_test)
prob_default = preds[:, 1]

# The ROC curve shows the trade-off between the false positive rate (fallout)
# and the true positive rate (sensitivity) across all thresholds.
roc_points = roc_curve(y_test, prob_default)
fallout, sensitivity, thresholds = roc_points

chance_line = [0, 1]
plt.plot(fallout, sensitivity, color='darkorange')
plt.plot(chance_line, chance_line, linestyle='--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# Area under the ROC curve. The original notes record a score of 0.94 and
# describe the curve as showing a fair lift.
auc_log_reg = roc_auc_score(y_test, prob_default)
print(auc_log_reg)
# Turn the predicted probabilities into class labels with a 0.5 threshold.
preds_df = pd.DataFrame(prob_default, columns=['prob_default'])
preds_df['loan_status'] = (preds_df['prob_default'] > 0.5).astype(int)

# Confusion matrix of the thresholded predictions.
print(confusion_matrix(y_test, preds_df['loan_status']))

# Number of predicted defaults and non-defaults. `.get(label, 0)` avoids a
# KeyError when one of the classes is never predicted.
predicted_counts = preds_df['loan_status'].value_counts()
num_defaults = predicted_counts.get(1, 0)
num_non_defaults = predicted_counts.get(0, 0)

# Recall of class 1: index [1] selects the recall array, the second [1]
# selects class 1.
default_recall = precision_recall_fscore_support(
    y_test, preds_df['loan_status'])[1][1]
print(default_recall)

log_ = pipe.named_steps['logisticregression']
# 4-fold cross validation of the whole pipeline. Scoring the bare classifier
# (as before) skips the scaling step, so it evaluated a different model from
# the one used for the predictions above.
cv_scores = cross_val_score(pipe, X_train_resampled, y_train_resampled, cv=4)
print(cv_scores)
# Scores recorded in the original notes for the unscaled classifier:
# [0.7218543, 0.73509934, 0.70198675, 0.66225166].

# Estimated impact of the default recall rate (left unfinished in the notes):
#   mean loan amount * num_defaults * (1 - default_recall)
# Example from the notes: a $100 loan given to 100,000 people is $10,000,000
# lent in total, and the notes put the expected loss at 11% of that amount.
# XGBoost: same preprocessing, gradient boosted tree classifier.
pipe_xgb = make_pipeline(StandardScaler(), XGBClassifier())

# Search space for the randomised hyper-parameter search. Keys use the
# '<step>__<parameter>' form so they address the classifier inside the
# pipeline. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low to high - 1.
params = {
    'xgbclassifier__learning_rate': uniform(0.01, 0.3),    # 0.01 .. 0.31
    'xgbclassifier__max_depth': randint(1, 10),            # 1 .. 9
    'xgbclassifier__min_child_weight': randint(1, 10),     # 1 .. 9
    'xgbclassifier__subsample': uniform(0.5, 0.5),         # 0.5 .. 1.0
    'xgbclassifier__colsample_bytree': uniform(0.5, 0.5),  # 0.5 .. 1.0
    'xgbclassifier__gamma': uniform(0, 1),                 # 0 .. 1
}

# Random search: 100 sampled configurations, 5-fold cross validation each.
rs = RandomizedSearchCV(pipe_xgb, param_distributions=params, n_iter=100,
                        cv=5, random_state=42)
rs.fit(X_train_resampled, y_train_resampled)

# Best configuration found by the search.
# Result recorded from an earlier run:
#   colsample_bytree=0.8629778394351197, gamma=0.8971102599525771,
#   learning_rate=0.27612592727953517, max_depth=8, min_child_weight=1,
#   subsample=0.8210158230771438
best_params = rs.best_params_
print('Best hyperparameters:', best_params)

# The keys of best_params still carry the pipeline step prefix. Passing them
# unchanged to XGBClassifier hands it parameter names it does not know, so
# the tuned values were never applied. Strip the prefix first.
step_prefix = 'xgbclassifier__'
xgb_params = {
    name[len(step_prefix):]: value
    for name, value in best_params.items()
    if name.startswith(step_prefix)
}
xgb_clf = XGBClassifier(**xgb_params)

# New pipeline with the tuned classifier, fitted on the training data.
pipe_xgb_new = make_pipeline(StandardScaler(), xgb_clf)
pipe_xgb_new.fit(X_train_resampled, y_train_resampled)
# Evaluate the tuned XGBoost pipeline on the test set.
y_xgb_predict = pipe_xgb_new.predict(X_test)

# Classification report.
target_names = ['Non-Default', 'Default']
print(classification_report(y_test, y_xgb_predict, target_names=target_names))

# Predicted probabilities; the second column is used as the probability of
# default.
y_xgb_pred_proba = pipe_xgb_new.predict_proba(X_test)
y_xgb_prob_default = y_xgb_pred_proba[:, 1]

# Area under the ROC curve, printed like the logistic regression AUC so the
# two can be compared when this runs as a script. The original notes record
# it as higher than the logistic regression score.
auc_xgb = roc_auc_score(y_test, y_xgb_prob_default)
print(auc_xgb)

# 4-fold cross validation of the classifier step on the training data.
xgb_ = pipe_xgb_new.named_steps['xgbclassifier']
cv_scores = cross_val_score(xgb_, X_train_resampled, y_train_resampled, cv=4)
# Scores recorded in the original notes:
# 0.84768212 0.87417219 0.86754967 0.90728477.
# The folds are independent, so their order carries no trend; what matters
# is how far the scores spread.
print(cv_scores)

# Confusion matrix of the test predictions.
print(confusion_matrix(y_test, y_xgb_predict))

# Mean cross-validation accuracy with a +/- two standard deviation band.
print("Average accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(),
                                               cv_scores.std() * 2))

# ROC curve.
fallout, sensitivity, thresholds = roc_curve(y_test, y_xgb_prob_default)
plt.plot(fallout, sensitivity, color='darkorange')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
# Comparing models.
xgb_preds_df = pd.DataFrame(y_xgb_predict, columns=['loan_status'])

# Number of predicted defaults (class 1). `.get(1, 0)` avoids a KeyError when
# class 1 is never predicted.
num_defaults_xgb = xgb_preds_df['loan_status'].value_counts().get(1, 0)

# Recall of class 1 for the XGBoost predictions: index [1] selects the recall
# array, the second [1] selects class 1.
default_recall_xgb = precision_recall_fscore_support(
    y_test, xgb_preds_df['loan_status'])[1][1]
# Show the XGBoost recall. The original displayed `default_recall`, which is
# the logistic regression value.
print(default_recall_xgb)

# Original observation: the classification_report() scores are all higher
# for the gradient boosted tree.

# Logistic regression classification report.
target_names = ['Non-Default', 'Default']
print(classification_report(y_test, preds_df['loan_status'], target_names=target_names))
# Gradient boosted tree classification report.
print(classification_report(y_test, xgb_preds_df['loan_status'], target_names=target_names))

# Macro-averaged F1 (unweighted mean of the per-class F1 scores) for the
# logistic regression.
print('The f1 score for the Logisitic Regression model is {:.2f}'.format(precision_recall_fscore_support(y_test,preds_df['loan_status'], average = 'macro')[2]))
# Macro-averaged F1 for the gradient boosted tree.
print('The f1 score for the XGBoost model is {:.2f}'.format(precision_recall_fscore_support(y_test,xgb_preds_df['loan_status'], average = 'macro')[2]))
# ROC curves of both models on one chart.

# Curve components for the logistic regression (lr) and the gradient boosted
# tree (gbt).
fallout_lr, sensitivity_lr, thresholds_lr = roc_curve(y_test, prob_default)
fallout_gbt, sensitivity_gbt, thresholds_gbt = roc_curve(y_test, y_xgb_prob_default)

# One entry per line to draw: x values, y values and the line styling.
roc_series = [
    (fallout_lr, sensitivity_lr,
     {'color': 'blue', 'label': '%s' % 'Logistic Regression'}),
    (fallout_gbt, sensitivity_gbt,
     {'color': 'green', 'label': '%s' % 'GBT'}),
    ([0, 1], [0, 1],
     {'linestyle': '--', 'label': '%s' % 'Random Prediction'}),
]
for x_values, y_values, line_style in roc_series:
    plt.plot(x_values, y_values, **line_style)

plt.title("ROC Chart for LR and GBT on the Probability of Default")
plt.xlabel('Fall-out')
plt.ylabel('Sensitivity')
plt.legend()
plt.show()
# Original conclusion: the gradient boosted tree has the best overall
# performance.
# Planned next step in the notes: check the calibration of the two models to
# see how stable the default prediction is across probabilities.

# Estimated impact of the default recall rate:
#   mean loan amount * predicted defaults * share of defaults not recalled
average_loan_amount = np.mean(df['loanamount'])
missed_default_rate = 1 - default_recall_xgb
estimated_loss = average_loan_amount * num_defaults_xgb * missed_default_rate
print(estimated_loss)

# Worked example left unfinished in the notes: a $100 loan given to 100,000
# people, with the expected loss compared against the $10,000,000 lent.
# Original conclusion: the gradient boost improves on the logistic regression,
# showing how a slight model improvement can reduce losses for a business.
# Feature weights of the tuned XGBoost model.

# Fitted classifier step of the pipeline and its importance scores.
xgb_clf = pipe_xgb_new.named_steps['xgbclassifier']
importances = xgb_clf.feature_importances_

# Feature names are the column labels of the resampled training frame.
feature_names = X_train_resampled.columns

# Feature name -> importance score.
feature_importances = {
    name: score for name, score in zip(feature_names, importances)
}

# Rank the features by score, highest first.
ranked_pairs = sorted(feature_importances.items(),
                      key=lambda pair: pair[1], reverse=True)
sorted_feature_importances = dict(ranked_pairs)

# Names of the five highest ranked features.
top_features = [name for name, _ in ranked_pairs[:5]]

print('The top 5 most important features are:')
for i in top_features:
    print(i)