residualplots.py
# Code for diagnostic plots -- sourced from:
# https://www.statsmodels.org/dev/examples/notebooks/generated/linear_regression_diagnostics_plots.html
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels
from statsmodels.tools.tools import maybe_unwrap_results
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
from typing import Type

style_talk = 'seaborn-talk'  # refer to plt.style.available


class Linear_Reg_Diagnostic():
"""
Diagnostic plots to identify potential problems in a linear regression fit.
Mainly,
a. non-linearity of data
b. Correlation of error terms
c. non-constant variance
d. outliers
e. high-leverage points
f. collinearity
Author:
Prajwal Kafle (p33ajkafle@gmail.com, where 3 = r)
Does not come with any sort of warranty.
    Please test the code on your end before using.
"""
def __init__(self,
results: Type[statsmodels.regression.linear_model.RegressionResultsWrapper]) -> None:
"""
        For a linear regression model, generates the following diagnostic plots:
a. residual
b. qq
c. scale location and
d. leverage
and a table
e. vif
Args:
results (Type[statsmodels.regression.linear_model.RegressionResultsWrapper]):
                must be an instance of statsmodels.regression.linear_model.RegressionResultsWrapper
Raises:
            TypeError: if results is not an instance of the above class
Example:
>>> import numpy as np
>>> import pandas as pd
>>> import statsmodels.formula.api as smf
>>> x = np.linspace(-np.pi, np.pi, 100)
>>> y = 3*x + 8 + np.random.normal(0,1, 100)
>>> df = pd.DataFrame({'x':x, 'y':y})
>>> res = smf.ols(formula= "y ~ x", data=df).fit()
>>> cls = Linear_Reg_Diagnostic(res)
>>> cls(plot_context="seaborn-paper")
        If you do not need all of the plots, you can also generate an individual plot/table
        in the following ways:
>>> cls = Linear_Reg_Diagnostic(res)
>>> cls.residual_plot()
>>> cls.qq_plot()
>>> cls.scale_location_plot()
>>> cls.leverage_plot()
>>> cls.vif_table()
"""
        if not isinstance(results, statsmodels.regression.linear_model.RegressionResultsWrapper):
            raise TypeError("results must be an instance of statsmodels.regression.linear_model.RegressionResultsWrapper")
self.results = maybe_unwrap_results(results)
self.y_true = self.results.model.endog
self.y_predict = self.results.fittedvalues
self.xvar = self.results.model.exog
self.xvar_names = self.results.model.exog_names
self.residual = np.array(self.results.resid)
influence = self.results.get_influence()
self.residual_norm = influence.resid_studentized_internal
self.leverage = influence.hat_matrix_diag
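        # OLSInfluence.cooks_distance returns a (distances, p-values) tuple; keep only the distances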
self.cooks_distance = influence.cooks_distance[0]
self.nparams = len(self.results.params)
def __call__(self, plot_context='seaborn-paper'):
# print(plt.style.available)
with plt.style.context(plot_context):
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10,10))
self.residual_plot(ax=ax[0,0])
self.qq_plot(ax=ax[0,1])
self.scale_location_plot(ax=ax[1,0])
self.leverage_plot(ax=ax[1,1])
plt.show()
self.vif_table()
return fig, ax
def residual_plot(self, ax=None):
"""
Residual vs Fitted Plot
Graphical tool to identify non-linearity.
        A (roughly) horizontal red line indicates that the residuals show no systematic pattern, i.e. a linear fit is appropriate.
"""
if ax is None:
fig, ax = plt.subplots()
sns.residplot(
x=self.y_predict,
y=self.residual,
lowess=True,
scatter_kws={'alpha': 0.5},
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
ax=ax)
# annotations
        residual_abs = np.abs(self.residual)
        # indices of the three largest absolute residuals
        abs_resid_top_3 = np.flip(np.argsort(residual_abs), 0)[:3]
        for i in abs_resid_top_3:
            ax.annotate(
                i,
                xy=(self.y_predict[i], self.residual[i]),
                color='C3')
ax.set_title('Residuals vs Fitted', fontweight="bold")
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
return ax
def qq_plot(self, ax=None):
"""
        Standardized Residual vs Theoretical Quantile plot
        Used to visually check if residuals are normally distributed.
        Points lying close to the diagonal line suggest that they are.
"""
if ax is None:
fig, ax = plt.subplots()
QQ = ProbPlot(self.residual_norm)
QQ.qqplot(line='45', alpha=0.5, lw=1, ax=ax)
# annotations
abs_norm_resid = np.flip(np.argsort(np.abs(self.residual_norm)), 0)
abs_norm_resid_top_3 = abs_norm_resid[:3]
for r, i in enumerate(abs_norm_resid_top_3):
ax.annotate(
i,
xy=(np.flip(QQ.theoretical_quantiles, 0)[r], self.residual_norm[i]),
ha='right', color='C3')
ax.set_title('Normal Q-Q', fontweight="bold")
ax.set_xlabel('Theoretical Quantiles')
ax.set_ylabel('Standardized Residuals')
return ax
def scale_location_plot(self, ax=None):
"""
        Sqrt(Standardized Residual) vs Fitted values plot
        Used to check homoscedasticity (constant variance) of the residuals.
        A roughly horizontal red line suggests constant variance.
"""
if ax is None:
fig, ax = plt.subplots()
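        # the square root of |standardized residuals| spreads out small values, making trends in the spread easier to see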
residual_norm_abs_sqrt = np.sqrt(np.abs(self.residual_norm))
        ax.scatter(self.y_predict, residual_norm_abs_sqrt, alpha=0.5)
sns.regplot(
x=self.y_predict,
y=residual_norm_abs_sqrt,
            scatter=False, ci=None,
lowess=True,
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
ax=ax)
# annotations
abs_sq_norm_resid = np.flip(np.argsort(residual_norm_abs_sqrt), 0)
abs_sq_norm_resid_top_3 = abs_sq_norm_resid[:3]
for i in abs_sq_norm_resid_top_3:
ax.annotate(
i,
xy=(self.y_predict[i], residual_norm_abs_sqrt[i]),
color='C3')
ax.set_title('Scale-Location', fontweight="bold")
ax.set_xlabel('Fitted values')
        ax.set_ylabel(r'$\sqrt{|\mathrm{Standardized\ Residuals}|}$')
return ax
def leverage_plot(self, ax=None):
"""
Residual vs Leverage plot
        Points falling outside the Cook's distance curves are considered influential,
        i.e. observations that can sway the fit.
        Ideally, no points fall outside the curves.
"""
if ax is None:
fig, ax = plt.subplots()
ax.scatter(
self.leverage,
self.residual_norm,
            alpha=0.5)
sns.regplot(
x=self.leverage,
y=self.residual_norm,
scatter=False,
            ci=None,
lowess=True,
line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
ax=ax)
# annotations
leverage_top_3 = np.flip(np.argsort(self.cooks_distance), 0)[:3]
for i in leverage_top_3:
ax.annotate(
i,
xy=(self.leverage[i], self.residual_norm[i]),
                color='C3')
xtemp, ytemp = self.__cooks_dist_line(0.5) # 0.5 line
ax.plot(xtemp, ytemp, label="Cook's distance", lw=1, ls='--', color='red')
xtemp, ytemp = self.__cooks_dist_line(1) # 1 line
ax.plot(xtemp, ytemp, lw=1, ls='--', color='red')
ax.set_xlim(0, max(self.leverage)+0.01)
ax.set_title('Residuals vs Leverage', fontweight="bold")
ax.set_xlabel('Leverage')
ax.set_ylabel('Standardized Residuals')
ax.legend(loc='upper right')
return ax
def vif_table(self):
"""
VIF table
VIF, the variance inflation factor, is a measure of multicollinearity.
VIF > 5 for a variable indicates that it is highly collinear with the
other input variables.
"""
vif_df = pd.DataFrame()
vif_df["Features"] = self.xvar_names
vif_df["VIF Factor"] = [variance_inflation_factor(self.xvar, i) for i in range(self.xvar.shape[1])]
print(vif_df
.sort_values("VIF Factor")
.round(2))
def __cooks_dist_line(self, factor):
"""
Helper function for plotting Cook's distance curves
"""
p = self.nparams
formula = lambda x: np.sqrt((factor * p * (1 - x)) / x)
x = np.linspace(0.001, max(self.leverage), 50)
y = formula(x)
return x, y
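

if __name__ == "__main__":
    # Minimal usage sketch mirroring the example in the class docstring. The data and the
    # variable names below (rng, x, y, df, res, diagnostics) are illustrative only and are
    # not part of the original module.
    import statsmodels.formula.api as smf

    rng = np.random.default_rng(0)
    x = np.linspace(-np.pi, np.pi, 100)
    y = 3 * x + 8 + rng.normal(0, 1, 100)
    df = pd.DataFrame({'x': x, 'y': y})

    res = smf.ols(formula="y ~ x", data=df).fit()
    diagnostics = Linear_Reg_Diagnostic(res)

    # All four plots plus the VIF table in one call
    # (note: matplotlib >= 3.6 renames this style to 'seaborn-v0_8-paper')
    diagnostics(plot_context='seaborn-paper')

    # ... or each diagnostic on its own
    diagnostics.residual_plot()
    diagnostics.qq_plot()
    diagnostics.scale_location_plot()
    diagnostics.leverage_plot()
    diagnostics.vif_table()
    plt.show()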