-
Notifications
You must be signed in to change notification settings - Fork 0
/
regressions.py
108 lines (87 loc) · 3.8 KB
/
regressions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import csv
import os
from typing import List, Optional

import pandas as pd
import seaborn as sns
import statsmodels.api as sm
class Regression_Wrapper:
    """Fit, store and export statsmodels OLS regressions over two DataFrames.

    The wrapper holds a "raw" and a "clean" frame. Every fitted model is
    registered under a caller-chosen name together with its summary, the
    column labels it was fit on, and which of the two frames supplied the data.
    """

    def __init__(
        self,
        df_raw: pd.DataFrame,
        df_clean: pd.DataFrame,
        date_str: Optional[List[str]] = None,
    ) -> None:
        """Store the two frames and create the empty per-model registries.

        Args:
            df_raw: Frame used when a regression is requested on raw data.
            df_clean: Frame used when a regression is requested on clean data.
            date_str: Stored on the instance unchanged; when omitted a fresh
                ``["date"]`` list is used. Presumably date column names -- no
                method in this class reads it, so confirm against callers.
        """
        self.df_raw = df_raw
        self.df_clean = df_clean
        # A new list per instance: a literal list default in the signature
        # would be shared (and mutated) across every wrapper built without
        # an explicit ``date_str``.
        self.date_str = ["date"] if date_str is None else date_str
        # All registries below are keyed by model name.
        self.models = {}           # fitted results objects
        self.model_summaries = {}  # summary objects of the fitted models
        self.plots = {}            # never written by the methods of this class
        self.used_data = {}        # "clean" or "raw"
        self.x_cols = {}           # list of predictor column labels
        self.y_col = {}            # response column label

    def get_raw(self) -> pd.DataFrame:
        """Return the raw DataFrame."""
        return self.df_raw

    def get_clean(self) -> pd.DataFrame:
        """Return the clean DataFrame."""
        return self.df_clean

    def _frame_for(self, df_type: str) -> pd.DataFrame:
        """Return the clean frame for ``"clean"`` and the raw frame otherwise."""
        return self.df_clean if df_type == "clean" else self.df_raw

    def run_linear_regression(
        self,
        model_name: str,
        x_cols: List[int],
        y_col: int,
        df_type: str = "clean",
    ) -> None:
        """Fit an OLS model with an added constant and register it.

        Args:
            model_name: Key under which the model and its metadata are stored.
                Reusing a name replaces the earlier entries.
            x_cols: Positional indices of the predictor columns.
            y_col: Positional index of the response column.
            df_type: ``"clean"`` selects the clean frame; any other value
                selects the raw frame.
        """
        # Anything that is not exactly "clean" is treated as "raw".
        source = "clean" if df_type == "clean" else "raw"
        df = self._frame_for(source)

        predictor_names = [df.columns[i] for i in x_cols]
        response_name = df.columns[y_col]
        design = sm.add_constant(df.iloc[:, x_cols])  # adds the constant term
        response = df.iloc[:, y_col]
        model = sm.OLS(response, design).fit()

        # Register only after a successful fit so a failure cannot leave
        # metadata behind for a model that does not exist.
        self.used_data[model_name] = source
        self.x_cols[model_name] = predictor_names
        self.y_col[model_name] = response_name
        self.models[model_name] = model
        self.model_summaries[model_name] = model.summary()

    def get_linear_regression_summary(self, model_name: str) -> str:
        """Return the stored summary as text, or ``"Model not found."``."""
        if model_name not in self.model_summaries:
            return "Model not found."
        return str(self.model_summaries[model_name])

    def write_regression_results_to_csv(self, model_name: str, filepath: str) -> None:
        """Append one CSV row per coefficient of a stored model.

        A header row is written first when ``filepath`` does not exist or is
        empty. Each row carries the coefficient, its p-value and confidence
        bounds, followed by the model-level R-squared, F statistic, F p-value
        and the summary's extra text.

        Args:
            model_name: Name of a previously fitted model.
            filepath: CSV file to append to.

        Raises:
            KeyError: If no model is stored under ``model_name``.
        """
        model = self.models[model_name]
        pvals = model.pvalues
        conf_int = model.conf_int()
        # Notes that the summary prints beneath its tables.
        notes = model.summary().extra_txt
        # Model-level statistics repeat on every row; convert them once.
        rsquared = float(model.rsquared)
        fvalue = float(model.fvalue)
        f_pvalue = float(model.f_pvalue)

        needs_header = not os.path.isfile(filepath) or os.path.getsize(filepath) == 0

        with open(filepath, mode='a', newline='') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow([
                    'model_name', 'variable', 'value', 'pvalue',
                    'conf_int_lower', 'conf_int_upper',
                    'rsquared', 'fvalue', 'f_pvalue', 'notes',
                ])
            writer.writerows(
                [
                    model_name,
                    variable,
                    coef,
                    float(pvals[variable]),
                    float(conf_int.loc[variable, 0]),
                    float(conf_int.loc[variable, 1]),
                    rsquared,
                    fvalue,
                    f_pvalue,
                    notes,
                ]
                for variable, coef in model.params.items()
            )

    def write_regression_latex(self, model_name: str, filepath: str) -> None:
        """Overwrite ``filepath`` with the LaTeX rendering of a stored summary.

        Raises:
            KeyError: If no summary is stored under ``model_name``.
        """
        latex = self.model_summaries[model_name].as_latex()
        with open(filepath, mode='w') as f:
            f.write(latex)

    def save_plot_png(self, model_name: str, filepath: str) -> None:
        """Save a seaborn regression plot for a single-predictor model.

        The plot is drawn from the same frame (clean or raw) the model was
        fit on.

        Args:
            model_name: Name of a previously fitted model.
            filepath: Destination passed to the figure's ``savefig``.

        Raises:
            ValueError: If the model was fit on more than one predictor.
            KeyError: If no model is registered under ``model_name``.
        """
        predictors = self.x_cols[model_name]
        if len(predictors) != 1:
            raise ValueError("Can only plot one x variable at a time.")

        df = self._frame_for(self.used_data[model_name])
        # NOTE(review): no ``ax`` is passed, so seaborn chooses the target
        # axes itself -- confirm repeated calls do not overlay on one figure.
        axes = sns.regplot(x=predictors[0], y=self.y_col[model_name], data=df)
        axes.get_figure().savefig(filepath)