-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlocal_app.py
361 lines (308 loc) · 14.1 KB
/
local_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
"""Credit Scoring Dashboard App
Author: Etienne Lardeur https://github.com/EtienneLardeur
Source: https://github.com/EtienneLardeur/Streamlit_App
launch (local) with command line: streamlit run local_app.py
"""
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import urllib
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from lime.lime_tabular import LimeTabularExplainer
import shap
# Turn off Streamlit's 'deprecation.showPyplotGlobalUse' warning
# (the warning about pyplot usage).
st.set_option('deprecation.showPyplotGlobalUse', False)
# File names of the pickled artifacts; load_obj appends them to GITHUB_ROOT.
MODEL_SAV_FILE = "model.sav"
DESC_PKL_FILE = 'desc.pkl'
FINAL_PKL_FILE = 'final.pkl'
SHAP_EXP_FILE = 'shap.exp'
SHAP_VAL_FILE = 'shap.val'
# Base URL (with trailing slash) that the file names above are appended to.
GITHUB_ROOT = (
"https://raw.githubusercontent.com/EtienneLardeur/Streamlit_App/main/"
)
# cache means unique function execution at start
# use pickle to load an object
def load_obj(file: str, root: "str | None" = None):
    """Download a pickle file and return the object it contains.

    Args:
        file: File name, appended as-is to the base URL.
        root: Base URL ending with a slash. When omitted, the module-level
            ``GITHUB_ROOT`` is used.

    Returns:
        The unpickled object.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    base_url = GITHUB_ROOT if root is None else root
    # SECURITY: unpickling can execute arbitrary code. Only point this
    # function at a source you fully trust.
    with urllib.request.urlopen(base_url + file) as open_file:
        return pickle.load(open_file)
@st.cache(suppress_st_warning=True)
def bulk_init():
    """Load every artifact the dashboard needs in a single cached call.

    Returns:
        An 8-tuple ``(desc, field_list, final, inputs, sk_id_list, pipe,
        shap_explainer, shap_values)``:
        the feature-description table, the list of its 'Feature' values,
        the scored applications table, that table without the 'RISK_FLAG'
        and 'RISK_PROBA' columns, the list of its index values, the loaded
        model wrapped by ``make_pipeline``, and the two SHAP objects.
    """
    # Feature descriptions and the list of feature names.
    description_table = load_obj(DESC_PKL_FILE)
    feature_names = description_table['Feature'].tolist()

    # Scored applications; the model inputs are the same table minus the
    # two prediction columns.
    scored_table = load_obj(FINAL_PKL_FILE)
    model_inputs = scored_table.drop(columns=['RISK_FLAG', 'RISK_PROBA'])
    client_ids = scored_table.index.tolist()

    # Loaded model, wrapped in a pipeline.
    model_pipeline = make_pipeline(load_obj(MODEL_SAV_FILE))

    # SHAP explainer and SHAP values, loaded from their own files.
    explainer = load_obj(SHAP_EXP_FILE)
    explained_values = load_obj(SHAP_VAL_FILE)

    return (
        description_table,
        feature_names,
        scored_table,
        model_inputs,
        client_ids,
        model_pipeline,
        explainer,
        explained_values,
    )
# Load all artifacts through the st.cache-decorated bulk_init.
desc, field_list, final, inputs, sk_id_list, pipe, shap_explainer, shap_values = bulk_init()
# turn positive-class probabilities into 0/1 labels using a threshold
@st.cache
def to_labels(pos_probs, threshold):
    """Return 1 where ``pos_probs >= threshold`` and 0 elsewhere, as ints."""
    reaches_threshold = pos_probs >= threshold
    return reaches_threshold.astype('int')
# build the initial, modifiable "results" table from the native predictions
@st.cache(allow_output_mutation=True)
def get_native_predictions(final):
    """Derive the starting results table and failure rate from ``final``.

    Args:
        final: Table holding the 'RISK_FLAG' and 'RISK_PROBA' columns.

    Returns:
        A 3-tuple ``(results, failure_rate, risk_proba)``: a copy of
        ``final``, the share of rows flagged 1 among rows flagged 0 or 1
        (rounded to two decimals), and the 'RISK_PROBA' column of ``final``.
    """
    native_flags = final['RISK_FLAG']
    # Count each class separately; the rate is failures over both counts.
    n_good = (native_flags == 0).sum()
    n_fail = (native_flags == 1).sum()
    native_rate = round(n_fail / (n_good + n_fail), 2)
    # A copy, so later label updates never touch the loaded table.
    return final.copy(), native_rate, final['RISK_PROBA']
# Create the original results table, the native failure rate and the
# native probabilities.
results, failure_rate, risk_proba = get_native_predictions(final)
# Features to show.
# NOTE(review): not referenced by the code visible here; confirm it is still needed.
features_to_show = []
# actualize predictions
def actualize_predictions(final, threshold):
    """Re-label every application with ``threshold`` and refresh ``results``.

    Deliberately NOT wrapped in ``st.cache``: the whole point of this
    function is to update the module-level ``results`` table in place, and
    a cache hit would skip that update, leaving labels computed for another
    threshold next to the freshly returned failure rate.

    Args:
        final: Table holding the 'RISK_PROBA' column (left unmodified).
        threshold: Probability from which an application is labelled 1.

    Returns:
        A 2-tuple ``(results, failure_rate)``: the module-level ``results``
        table (its 'RISK_FLAG' column overwritten with the new labels) and
        the share of applications labelled 1, rounded to two decimals.
    """
    # The probabilities themselves never change, only the labels do.
    risk_proba = final['RISK_PROBA']
    risk_flag = to_labels(risk_proba, threshold)
    pred_good = (risk_flag == 0).sum()
    pred_fail = (risk_flag == 1).sum()
    failure_rate = round(pred_fail / (pred_good + pred_fail), 2)
    # Side effect relied on by the rest of the script: ``results`` is the
    # module-level table, updated in place.
    results['RISK_FLAG'] = risk_flag
    return results, failure_rate
# Page title; the string is Markdown, so the leading '#' makes it a heading.
st.write("""
# Credit scoring of client's applications
""")
# Sidebar ##################################################
st.sidebar.header('Inputs Panel')
### Sidebar - subsection Failure Rate Control ###
st.sidebar.subheader('- *Supervisor Only* Failure Rate Control')
# Failure rate returned by get_native_predictions (native labels).
st.sidebar.write('Initial Failure Rate', failure_rate)
def threshold_prediction_component():
    """Render the threshold slider and show the resulting failure rate.

    Returns:
        The threshold currently selected on the slider (between 0 and 1,
        0.5 by default).
    """
    new_threshold = st.sidebar.slider(
        label='Threshold:',
        min_value=0.,
        value=0.5,
        max_value=1.)
    # Only the failure rate is needed here: actualize_predictions updates
    # the module-level results table in place, so the returned table is
    # intentionally not bound to a local name.
    _, new_failure_rate = actualize_predictions(final, new_threshold)
    st.sidebar.write('Current Failure Rate', new_failure_rate)
    return new_threshold
# Render the threshold control and keep the selected threshold.
curr_threshold = threshold_prediction_component()
### Sidebar - subsection Client selection ###
st.sidebar.subheader('- Client selection')
def client_input_features():
    """Let the user pick a client ID and fetch that client's results.

    Returns:
        A 2-tuple ``(sk_row, sk_id_curr)``: the ``results`` rows for the
        chosen ID as a DataFrame, and the chosen ID itself.
    """
    chosen_id = st.sidebar.selectbox(
        'Please select Client ID',
        sk_id_list,
        0)
    # A list-based .loc lookup keeps the selection a DataFrame.
    chosen_rows = results.loc[[chosen_id]]
    return chosen_rows, chosen_id
# Render the client selector; keep the selected row(s) and the selected ID.
select_sk_row, select_sk_id = client_input_features()
### Sidebar - subsection tune ###
st.sidebar.subheader('- *tbd* Tune Application')
### Sidebar - subsection feature description ###
st.sidebar.subheader('- *tbc* Get full description of a feature')
def field_description():
    """Let the user pick a feature and return its description.

    Side effect: raises pandas' global ``display.max_colwidth`` option to
    the length of the longest selected description, so that rendering the
    returned Series as text does not shorten it. The option is left
    untouched when no row matches.

    Returns:
        The 'Description' entries of ``desc`` whose 'Feature' equals the
        selected feature, as a pandas Series.
    """
    field = st.sidebar.selectbox('Please select a feature', field_list, 0)
    description = desc[desc['Feature'] == field]['Description']
    # The display limit is expressed in characters, so it has to come from
    # the text length, not from the number of matching rows.
    text_lengths = description.astype(str).str.len()
    if not text_lengths.empty:
        pd.options.display.max_colwidth = int(text_lengths.max())
    return description
# Render the feature selector and show the matching description as text.
txt_field_desc = field_description()
st.sidebar.text(txt_field_desc)
# Main page ##################################################
st.subheader('- __*Demo only*__ Generate applications sample')
def application_samples_component(sample_size=3):
    """Render the 'Samples' button and, when clicked, one sample per class.

    Args:
        sample_size: Maximum number of rows drawn for each class. Fewer
            rows are shown when a class does not hold that many, which can
            happen at extreme thresholds.
    """
    if not st.button('Samples'):
        return
    sections = (
        ('predicted __without__ difficulty to repay - sample', 0),
        ('predicted __with__ difficulty to repay - sample', 1),
    )
    for caption, flag in sections:
        subset = results[results['RISK_FLAG'] == flag]
        # DataFrame.sample raises ValueError when asked for more rows than
        # the frame holds, so cap the request at the population size.
        n_rows = min(sample_size, len(subset))
        shown = subset.sample(n_rows) if n_rows > 0 else subset.head(0)
        st.markdown(caption)
        st.write(shown)
# Render the 'Samples' button; samples appear only when it is clicked.
application_samples_component()
# Show the results row(s) of the client selected in the sidebar.
st.subheader('Selected Client')
st.write(select_sk_row)
# SHAP section #################################################
st.subheader(' __*Experiment:*__ Generate SHAP explainer')
def shap_explaination(sk_id_curr):
    """Render the SHAP button and, when clicked, three SHAP force plots.

    All plots use index 1 of the explainer's expected values and of the
    SHAP values. They are built from: the row of ``inputs`` matching
    ``sk_id_curr``, the first row of ``inputs``, and the first 500 rows of
    ``inputs``.

    Args:
        sk_id_curr: Index label of the selected client in ``inputs``.
    """
    if not st.button("Explain Results by SHAP"):
        return

    def as_html_page(force_plot):
        # Bundle SHAP's JavaScript with the plot markup in one document.
        return f"<head>{shap.getjs()}</head><body>{force_plot.html()}</body>"

    with st.spinner('Calculating...'):
        st.write('__SH__apley __A__dditive ex__P__lanations provide an overview of how most important features impacts Class prediction')
        st.write('*__Force plot__ shows, __depending on the ground data selected__, how opposite are the features strenghs*')
        st.write('*Green means feature value makes Default Risk lower while Red means feature value makes Default Risk higher*')

        # Positional index of the selected client inside ``inputs``.
        row_position = inputs.index.get_loc(sk_id_curr)

        # Plot for the selected application.
        selected_html = as_html_page(shap.force_plot(
            shap_explainer.expected_value[1],
            shap_values[1][row_position],
            inputs.iloc[[row_position]], plot_cmap="PkYg"))

        # Plot built from the first row of the inputs.
        first_row_html = as_html_page(shap.force_plot(
            shap_explainer.expected_value[1],
            shap_values[1][0, :],
            inputs.iloc[0, :], plot_cmap="PkYg"))

        # Plot built from the first 500 rows of the inputs.
        sample_html = as_html_page(shap.force_plot(
            shap_explainer.expected_value[1],
            shap_values[1][:500, :],
            inputs.iloc[:500, :], plot_cmap="PkYg"))

        # The SHAP summary plot is currently not displayed.
        displays = (
            ('__ - SHAP Force plot considering entire new Applications data (test)__',
             first_row_html, 120),
            ('__ - SHAP Force plot for the selected Application__',
             selected_html, 120),
            ('__ - SHAP Force plot to provide feature analysis along a sample of Applications (here 10% of test set)__',
             sample_html, 350),
        )
        for caption, page, page_height in displays:
            st.write(caption)
            components.html(page, height=page_height)
# Render the SHAP button for the client selected in the sidebar.
shap_explaination(select_sk_id)
# Lime section ################################################
st.subheader('__*Actionable:*__ Generate LIME explainer')
st.write('__ - L__ocal __I__nterpretable __M__odel-agnostic __E__xplanations:')
def lime_explaination(inputs, results, select_sk_id):
    """Render the LIME controls and, on demand, the LIME explanation.

    Two sliders set the number of features to explain and the number of
    nearest neighbours to compare with. When the button is clicked the
    function:

    1. explains the selected client's prediction with LIME, using the
       module-level ``pipe.predict_proba``;
    2. draws a bar chart of the explanation weights;
    3. finds the client's nearest neighbours on the explained features;
    4. draws, for every explained feature, the client's value next to the
       means of its neighbours, of class 0 and of class 1.

    Args:
        inputs: Model input table, indexed by client ID.
        results: Table holding the current 'RISK_FLAG' labels, indexed
            like ``inputs``.
        select_sk_id: Index label of the selected client.
    """
    st.write('*Please set the number of __features__ you want to analyse (LIME will grab most important first)*')
    nb_features = st.slider(
        label='Number of Features to analyse',
        min_value=7,
        value=10,
        max_value=15)
    st.write('*Please set the number of __similar applications__ you want to compare with (similarity according to most important features)*')
    nb_neighbors = st.slider(
        label='Number of similar applications to consider',
        min_value=10,
        value=20,
        max_value=50)
    if not st.button("Explain Results by LIME"):
        return
    with st.spinner('Calculating...'):
        lime_explainer = LimeTabularExplainer(
            training_data=inputs.values,
            mode='classification',
            training_labels=results[['RISK_FLAG']],
            feature_names=inputs.columns)
        exp = lime_explainer.explain_instance(
            inputs.loc[select_sk_id].values,
            pipe.predict_proba,
            num_features=nb_features)
        # introduce next step
        st.write('__ - LIME explaination for the selected Client:__')
        st.write('*Positive value __Red__ means __Support__ the Class 1: Failure Risk*')
        st.write('*Negative value __Green__ means __Contradict__ the Class 1: Failure Risk*')
        # Column positions of the explained features, in explanation order.
        id_cols = [item[0] for item in exp.as_map()[1]]
        # Inputs restricted to the explained features.
        df_lime = inputs.filter(
            inputs.columns[id_cols].tolist())
        # compute inputs for plots
        exp_list = exp.as_list()
        vals = [x[1] for x in exp_list]
        names = [x[0] for x in exp_list]
        # Background colours are taken BEFORE the reversal below, so they
        # stay in explanation order for the per-feature subplots.
        axisgb_colors = ['#fee0d2' if x > 0 else '#c7e9c0' for x in vals]
        # Reverse so the first explained feature ends up at the top of the
        # horizontal bar chart.
        vals.reverse()
        names.reverse()
        colors = ['red' if x > 0 else 'green' for x in vals]
        pos = np.arange(len(exp_list)) + .5
        # create tab plot
        tab = plt.figure()
        plt.barh(pos, vals, align='center', color=colors)
        plt.yticks(pos, names)
        plt.title('Local explanation for Class 1: Failure Risk')
        st.pyplot(tab)
        # pyplot keeps every figure alive until it is closed; release it so
        # figures do not pile up across Streamlit reruns.
        plt.close(tab)
        # find nb_neighbors nearest neighbors to catch anomaly
        nearest_neighbors = NearestNeighbors(
            n_neighbors=nb_neighbors,
            radius=0.4)
        nearest_neighbors.fit(df_lime)
        # Ask for one extra neighbour and drop the first hit, which is the
        # client itself.
        neighbors = nearest_neighbors.kneighbors(
            df_lime.loc[[select_sk_id]],
            nb_neighbors + 1,
            return_distance=False)[0]
        neighbors = np.delete(neighbors, 0)
        # compute values for neighbors, class0 and class1
        # RISK_FLAG is appended only after the neighbour search, so the
        # labels do not influence which neighbours are found.
        df_lime['RISK_FLAG'] = results['RISK_FLAG']
        neighbors_values = pd.DataFrame(
            df_lime.iloc[neighbors].mean(),
            index=df_lime.columns,
            columns=['Neighbors_Mean'])
        # RISK_FLAG is the last column, hence tail(1).
        st.write('__- Neighbors Risk Flag averaged__',
                 neighbors_values.Neighbors_Mean.tail(1).values[0])
        st.write('*Nb. Neighborood __do not__ take Risk prediction values into account*')
        client_values = df_lime.loc[[select_sk_id]].T
        client_values.columns = ['Client_Value']
        class1_values = pd.DataFrame(
            df_lime[df_lime['RISK_FLAG'] == 1].mean(),
            index=df_lime.columns,
            columns=['Class_1_Mean'])
        class0_values = pd.DataFrame(
            df_lime[df_lime['RISK_FLAG'] == 0].mean(),
            index=df_lime.columns,
            columns=['Class_0_Mean'])
        # iloc[:-1] drops the trailing RISK_FLAG row from the mean tables.
        any_values = pd.concat(
            [class0_values.iloc[:-1],
             class1_values.iloc[:-1],
             neighbors_values.iloc[:-1],
             client_values],
            axis=1)
        colorsList = ('tab:green', 'tab:red', 'tab:cyan', 'tab:blue')
        # Drive the subplots by the number of features LIME actually
        # returned, which can be lower than the slider value.
        n_explained = len(exp_list)
        # squeeze=False keeps ``axs`` two-dimensional for any row count.
        fig, axs = plt.subplots(
            n_explained,
            sharey='row',
            figsize=(8, 4 * n_explained),
            squeeze=False)
        per_feature = any_values.T
        for i in range(n_explained):
            ax = axs[i, 0]
            ax.barh(per_feature.index,
                    per_feature.iloc[:, i],
                    color=colorsList)
            ax.set_title(str(any_values.index[i]), fontweight="bold")
            ax.patch.set_facecolor(axisgb_colors[i])
        st.write('__ - Details of LIME explaination for each features: __')
        st.write('*Nb. You may compare Client value with mean of its Neighbors, Class 1 & Class 0*')
        st.write('*Colored lightred / lightgreen foreground is related to Class 1: Failure Risk Support / Contradict*')
        st.pyplot(fig)
        plt.close(fig)
# Render the LIME controls and, on demand, the explanation for the selected client.
lime_explaination(inputs, results, select_sk_id)