-
Notifications
You must be signed in to change notification settings - Fork 0
/
R_script.R
231 lines (146 loc) · 5.46 KB
/
R_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# Loading the data. The first 2 row deleted manually in excel before loading the data.
library(readxl)
ISSSR <- read_excel("ISSSR_dataset.xlsx",
na = "NA")
# Keeping important variables only:
dat <- ISSSR[,c("GNI per capita (constant 2010 US$)",
"CHE / GDP",
"Any BF age 12m",
"DALY Asthma",
"DALY Diabetes",
"DALY IBD",
"DALY Anxiety")]
# Exploring the data head
head(dat)
# Exploring the variable types
str(dat)
# Rounding the Any BF age 12m
dat$`Any BF age 12m` <- round(dat$`Any BF age 12m`,0)
# Summary statistic for each variabl
summary(dat)
#########################
### For DALY Asthma #####
#########################
# Subsetting for Asthma
dat1 <- dat[,c("GNI per capita (constant 2010 US$)",
"CHE / GDP",
"Any BF age 12m",
"DALY Asthma")]
# Bi-variate analysis:
library(GGally)
ggpairs(dat1)
# Model fitting:
fit1 <- lm(`DALY Asthma`~., data = dat1)
# Checking the assumption of the model graphically
shapiro.test(fit1$residuals)
# The null hypothesis for shapiro test is "Data follows normal distribution". So, the
# p-value need to be greater than level of significance to interpret that the data is normal. In this
# case the p-value is less than 0.05 (if we consider 5% level of significance), so the null
# hypothesis is rejected and conclude that the residual of fit1 are not normal.
# So, need transformation of dependent variable.
### The distribution of dependent variable is is positively skweed,
# as a result we choose log transformation.
# Model fitting: Model after taking log transformation
fit2 <- lm(log(`DALY Asthma`)~., data = dat1)
# Checking normality
shapiro.test(fit2$residuals) # seems fine
# Exploring the summary of the model
summary(fit2)
# None of the independent variable have any statistically significant impact on DALY Asthma at 5% level of significance,
# because all the p-values are greater than 0.05
# Checking multicollinearity
car::vif(fit2)
# vif values greater than 5 is problematic. So, in our case it's fine.
# Checking heteroscedasticity
lmtest::bptest(fit2) # Breusch-Pagan test
# Result interpretation for heteroscedasticity
# The test have a p-value greater that a significance level of 0.05,
# therefore we can not reject the null hypothesis that the variance of the residuals is constant
# and infer that heteroscedasticity is not indeed present.
#########################
### For DALY Diabetes ###
#########################
# Subsetting for Asthma
dat2 <- dat[,c("GNI per capita (constant 2010 US$)",
"CHE / GDP",
"Any BF age 12m",
"DALY Diabetes")]
# Bi-variate analysis:
library(GGally)
ggpairs(dat2)
# Model fitting:
fit3 <- lm(`DALY Diabetes`~., data = dat2)
# Checking normality
shapiro.test(fit3$residuals)
# Residual of fit3 not normal. So, we need to use transformation.
# Model fitting:
fit4 <- lm(log(`DALY Diabetes`)~., data = dat2)
# Checking normality
shapiro.test(fit4$residuals) # seems fine
# Exploring the summary of the model
summary(fit4)
# None of the independent variable have any statistically significant impact on DALY Diabetes at 5% level of significance,
# because all the p-values are greater than 0.05
# Checking multicollinearity
car::vif(fit4)
# vif values greater than 5 is problematic. So, in our case it's fine.
# Checking heteroscedasticity
lmtest::bptest(fit4) # Breusch-Pagan test
####################
### For DALY IBD ###
####################
# Subsetting for Asthma
dat3 <- dat[,c("GNI per capita (constant 2010 US$)",
"CHE / GDP",
"Any BF age 12m",
"DALY IBD")]
# Bi-variate analysis:
library(GGally)
ggpairs(dat3)
# Model fitting:
fit5 <- lm(`DALY IBD`~., data = dat3)
# Checking normality
shapiro.test(fit5$residuals)
# So, we need to use transformation.
# Model fitting:
fit6 <- lm(log(`DALY IBD`)~., data = dat3)
# Checking normality
shapiro.test(fit6$residuals) # residual still not normal.
# So, we explore the data transformation graph on pdf file and try to get idea by
# which transformation we can make the residual normal. And we found 1/sqrt() transformation
# would work for our DALY IBD.
# Model fitting:
fit7 <- lm(1/sqrt(`DALY IBD`)~., data = dat3)
# Checking normality
shapiro.test(fit7$residuals) # Data seems normal now
# Model summary:
summary(fit7)
# Checking multicollinearity
car::vif(fit7)
# Checking heteroscedasticity
lmtest::bptest(fit7) # Breusch-Pagan test
#########################
### For DALY Anxiety ###
#########################
# Subsetting for Asthma
dat4 <- dat[,c("GNI per capita (constant 2010 US$)",
"CHE / GDP",
"Any BF age 12m",
"DALY Anxiety")]
# Bi-variate analysis:
library(GGally)
ggpairs(dat4)
# Model fitting:
fit8 <- lm(`DALY Anxiety`~., data = dat4)
# Checking normality
shapiro.test(fit8$residuals) # Not normal, so need transformation.
# Model fitting:
fit9 <- lm(log(`DALY Anxiety`)~., data = dat4)
# Checking normality
shapiro.test(fit9$residuals) # Seems fine
# Model summary:
summary(fit9)
# Checking multicollinearity
car::vif(fit9)
# Checking heteroscedasticity
lmtest::bptest(fit9) # Breusch-Pagan test