# project.R
#load the mortgage dataset (path points to the original author's local file)
df <- read.csv("/Users/candicewu/Desktop/mortgage data.csv")
#load the libraries
library(zoo)
library(ggplot2)
library(kableExtra)
library(dplyr)
library(caTools)
library(psych)
library(corrplot)
library(car)
library(stats)
library(moments)
#get a general overview of the dataset
dim(df)
head(df)
tail(df)
summary(df)
names(df)
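#(optional added check, not in the original script) count missing values per
#column so any NA handling can be decided before modeling
colSums(is.na(df))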
#remove the unnecessary first column
df <- df[, -1]
head(df)
#create a correlation matrix plot
par(mfrow = c(1, 1))
df.matrix <- cor(df)
corrplot(df.matrix, method = "number")
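#(optional addition) rank the predictors by their correlation with the
#response variable MORTGAGE30US
sort(df.matrix[, "MORTGAGE30US"], decreasing = TRUE)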
# Create a plot window with a histogram for each variable
par(mfrow = c(3, 3))
hist(df$MORTGAGE30US)
hist(df$FEDFUNDS)
hist(df$CPILFESL)
hist(df$TB3MS)
hist(df$GS20)
hist(df$HQMCB10YR)
hist(df$LREM64TTUSM156S)
hist(df$NGDPNSAXDCUSQ)
hist(df$USSTHPI)
#find the skewness
#https://www.statology.org/skewness-kurtosis-in-r/
skewness(df)
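#(optional addition) flag the variables whose absolute skewness falls in the
#commonly cited "moderately skewed" range of roughly 0.5 to 1, the rule of
#thumb used for the transformations below
skew_vals <- skewness(df)
skew_vals[abs(skew_vals) > 0.5 & abs(skew_vals) < 1]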
#since USSTHPI and LREM64TTUSM156S are moderately skewed, apply a square-root transformation
transformed_df <- data.frame(df, sqrt_USSTHPI = sqrt(df$USSTHPI))
transformed_df$sqrt_LREM64TTUSM156S <- sqrt(df$LREM64TTUSM156S)
head(transformed_df)
#compare boxplots of the transformed and untransformed variables
par(mfrow = c(1, 2))
boxplot(transformed_df$USSTHPI, main = "USSTHPI Boxplot", xlab = "USSTHPI Variable")
boxplot(transformed_df$sqrt_USSTHPI, main = "sqrt_USSTHPI Boxplot", xlab = "sqrt_USSTHPI Variable")
boxplot(transformed_df$LREM64TTUSM156S, main = "LREM64TTUSM156S Boxplot", xlab = "LREM64TTUSM156S Variable")
boxplot(transformed_df$sqrt_LREM64TTUSM156S, main = "sqrt_LREM64TTUSM156S Boxplot", xlab = "sqrt_LREM64TTUSM156S Variable")
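#(optional added check) the square-root columns should, if the transform
#helped, show skewness closer to 0 than the original columns
skewness(transformed_df[, c("USSTHPI", "sqrt_USSTHPI", "LREM64TTUSM156S", "sqrt_LREM64TTUSM156S")])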
#Estimate OLS with all variables before transformation (full model)
model1 <- lm(MORTGAGE30US ~ ., data = df)
summary(model1)
#Estimate OLS with all variables, including the square-root transformed ones
model2 <- lm(MORTGAGE30US ~ ., data = transformed_df)
summary(model2)
#check multicollinearity
vif(model1)
vif(model2)
#as a common rule of thumb, VIF values should not be larger than 10
#drop insignificant variables with high VIF values one at a time
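#(added helper, not in the original script) list the predictors in model2
#whose VIF exceeds 10; these are candidates for the removals made below
vif_vals <- vif(model2)
sort(vif_vals[vif_vals > 10], decreasing = TRUE)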
transformed_df <- subset(transformed_df, select=-c(CPILFESL,TB3MS,GS20,LREM64TTUSM156S,NGDPNSAXDCUSQ,USSTHPI))
head(transformed_df)
#best model
model3<-lm(MORTGAGE30US~ ., data=transformed_df)
summary(model3)
vif(model3)
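#(optional addition) compare the fit of the full and reduced models;
#adjusted R-squared penalizes the extra predictors in model1
summary(model1)$adj.r.squared
summary(model3)$adj.r.squared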
# Analyze residuals against each of the x-variables using scatterplots
par(mfrow = c(2, 2))
residuals <- residuals(model3)
#https://www.statology.org/residual-plot-r/
plot(transformed_df$FEDFUNDS, residuals, xlab = "FEDFUNDS", ylab = "Residuals", main = "Residual Plot for FEDFUNDS")
#add a horizontal line at 0
abline(0,0, col='blue')
plot(transformed_df$sqrt_USSTHPI, residuals, xlab = "sqrt_USSTHPI", ylab = "Residuals", main = "Residual Plot for sqrt_USSTHPI")
abline(0,0, col='blue')
plot(transformed_df$HQMCB10YR, residuals, xlab = "HQMCB10YR", ylab = "Residuals", main = "Residual Plot for HQMCB10YR")
abline(0,0, col='blue')
plot(transformed_df$sqrt_LREM64TTUSM156S, residuals, xlab = "sqrt_LREM64TTUSM156S", ylab = "Residuals", main = "Residual Plot for sqrt_LREM64TTUSM156S")
abline(0,0, col='blue')
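#(optional addition) R's built-in diagnostics for lm give the same
#residuals-vs-fitted view plus a normal Q-Q plot, scale-location plot,
#and leverage plot
par(mfrow = c(2, 2))
plot(model3)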
#https://www.statology.org/weighted-least-squares-in-r/
#load lmtest package
library(lmtest)
#perform White's test for heteroscedasticity
bptest(model3, ~fitted(model3)+I(fitted(model3)^2))
#H0: Homoscedasticity is present
#HA: Heteroscedasticity is present
#Since the p-value is less than 0.05, we reject the null hypothesis.
#We can conclude that there is sufficient evidence to say that heteroscedasticity is present in the regression model.
#https://www.statology.org/robust-standard-errors-in-r/
library(sandwich)
#calculate robust standard errors for model coefficients
coeftest(model3, vcov = vcovHC(model3, type = 'HC0'))
#Since heteroscedasticity is present in the original regression model,
#this estimate for the standard error is more trustworthy
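#(optional addition, for comparison) the conventional, non-robust standard
#errors, which match the lm summary output
coeftest(model3)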
#perform weighted least squares regression
#define weights: the inverse of the squared fitted values from a regression of
#the absolute residuals on the fitted values
wt <- 1 / lm(abs(model3$residuals) ~ model3$fitted.values)$fitted.values^2
#fit the weighted least squares model
wls_model <- lm(MORTGAGE30US ~ ., data = transformed_df, weights = wt)
#view summary of model
summary(wls_model)
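#(optional added check) plot standardized residuals of the WLS fit against its
#fitted values; a roughly constant spread suggests the weighting has reduced
#the heteroscedasticity
par(mfrow = c(1, 1))
plot(fitted(wls_model), rstandard(wls_model),
     xlab = "Fitted values", ylab = "Standardized residuals",
     main = "WLS residuals vs fitted")
abline(h = 0, col = "blue")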
# regression table
library(stargazer)
stargazer(model1,model3,wls_model, type = "text")
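#(optional addition) stargazer can also write the same table to a file through
#its 'out' argument, e.g.:
#stargazer(model1, model3, wls_model, type = "text", out = "regression_table.txt")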