---
title: "Power Law Scaling Proposal vs Alternate Linear Models in MSAs"
author: "Alex Haase"
output:
  word_document: default
  pdf_document: default
  html_document: default
---
#Overview
This analysis weighs the supra-linear power-law scaling proposal, in which gross metropolitan product (GMP) grows faster than proportionally with population, against competing linear models that relate per-capita GMP to the finance, professional/technical, ICT, and management variables in the 2006 MSA data.
#Required Libraries
```{r}
library(boot)    # cv.glm for k-fold cross-validation
library(MASS)
library(ggplot2)
rm(list = ls()) # clear global environment
getwd()         # double-check working directory
msadata = read.csv("http://dept.stat.lsa.umich.edu/~bbh/s485/data/gmp-2006.csv", header = TRUE)
class(msadata) # confirm msadata is a data frame
#msadata objects
msaName = msadata$MSA #MSA name (metropolitan statistical areas)
pcgmp = msadata$pcgmp #per-capita GMP
pop = msadata$pop #population
#secondary (comparison) variables
finance = msadata$finance
prof.tech = msadata$prof.tech
ict = msadata$ict
management = msadata$management
```
#Conceptual Preliminaries and Initial Hypothesis
Primary fields: MSA name, pcgmp, pop.
Comparison variables: finance, prof.tech, ict, management.
I side with the theory competing against the supra-linear power-law
scaling proposition. It seems more reasonable that population size alone is
not the only factor behind economic productivity; productivity can also be
attributed to existing financial establishments and concentrations, along with
the information, communication, and technology sectors. The two candidate
model forms are sketched below before being fit formally.
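As a rough sketch of the two candidate specifications (both are fit and assessed properly in the chunks that follow), the power-law proposal regresses log per-capita GMP on log population, while an alternate linear model relates per-capita GMP to one of the secondary industry variables. The formula objects below are illustrative only and assume the `msadata` columns loaded in the setup chunk.
```{r}
# Power-law proposal: GMP = c * pop^b, so pcgmp = GMP/pop = c * pop^(b-1),
# which is linear on the log-log scale:
power_law_form = log(pcgmp) ~ log(pop)
# One alternate linear model (analogous forms swap in prof.tech, ict, or management):
alternate_form = pcgmp ~ finance
power_law_form
alternate_form
```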
#Exploratory Analysis and GMP Equation
```{r}
#per-capita_GMP = GMP/N, therefore: GMP = pop * per-capita_GMP
gmp = pop * as.double(pcgmp)
#gmp[1] #testing
#gmp[2] #testing
```
#Summarizing the variables and handling missing data
```{r}
#first omitting each case with an "NA" present, per variable
omit_finance = na.omit(finance)
omit_prof.tech = na.omit(prof.tech)
omit_ict = na.omit(ict)
omit_management = na.omit(management)
financeMean = mean(omit_finance)
#financeMean
prof.TechMean = mean(omit_prof.tech)
#prof.TechMean
ictMean = mean(omit_ict)
#ictMean
managementMean = mean(omit_management)
#managementMean
#removing entire rows with "NA" present in the dataset
#str(msadata)
#complete.cases(msadata) #return boolean for rows with NA
clean_msadata = msadata[complete.cases(msadata), ] #no NA's
#str(clean_msadata)
#clean_msadata
```
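As a side note, the per-variable means above can also be computed directly with `na.rm = TRUE`; this minimal sketch assumes the same columns loaded in the setup chunk and should match the `na.omit()` results.
```{r}
# Equivalent one-liners: drop NAs inside mean() rather than via na.omit()
mean(finance, na.rm = TRUE)
mean(prof.tech, na.rm = TRUE)
mean(ict, na.rm = TRUE)
mean(management, na.rm = TRUE)
```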
#Scatterplots via ggplot with smoothing
```{r}
#GMP vs Pop
gmp_vs_pop_Plot = ggplot(msadata, aes(x=pop, y=gmp)) +
geom_point(shape=18, color="blue") +
geom_smooth(method=lm, se=FALSE, linetype="dashed", color="darkred")
gmp_vs_pop_Plot
#log GMP vs Pop
log_GMP_vs_pop_Plot = ggplot(msadata, aes(x=pop, y=log(gmp))) +
geom_point(shape=18, color="blue") +
geom_smooth(method=lm, se=FALSE, linetype="dashed", color="darkred")
log_GMP_vs_pop_Plot
#log GMP vs log Pop
log_GMP_vs_log_pop_Plot = ggplot(msadata, aes(x=log(pop), y=log(gmp))) +
geom_point(shape=18, color="blue") +
geom_smooth(method=lm, se=FALSE, linetype="dashed", color="darkred")
log_GMP_vs_log_pop_Plot
```
Notes: taking the log of both variables (GMP and pop) seems to be the best choice for visually
representing the data. The other two plots heavily cluster the data on the left-hand side of the plot,
while the log-log scale spreads the points out, resulting in a much cleaner visual image.
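An alternative to transforming the variables inside `aes()` is to keep the raw values and put the axes on a log10 scale, so the tick labels stay in the original units; a minimal sketch assuming the same `msadata`, `pop`, and `gmp` objects as above.
```{r}
# Same log-log view, but with axis tick labels kept in the original units
ggplot(msadata, aes(x = pop, y = gmp)) +
  geom_point(shape = 18, color = "blue") +
  geom_smooth(method = lm, se = FALSE, linetype = "dashed", color = "darkred") +
  scale_x_log10() +
  scale_y_log10()
```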
#log(GMP) vs. secondary variables
```{r}
#log GMP vs finance
log_GMP_vs_finance = ggplot(msadata, aes(x=finance, y=log(gmp))) +
geom_point(shape=18, color="blue") +
geom_smooth(method=lm, se=FALSE, linetype="dashed", color="darkred")
log_GMP_vs_finance
#log GMP vs prof.tech
log_GMP_vs_prof.tech = ggplot(msadata, aes(x=prof.tech, y=log(gmp))) +
geom_point(shape=18, color="blue") +
geom_smooth(method=lm, se=FALSE, linetype="dashed", color="darkred")
log_GMP_vs_prof.tech
#log GMP vs ict
log_GMP_vs_ict = ggplot(msadata, aes(x=ict, y=log(gmp))) +
geom_point(shape=18, color="blue") +
geom_smooth(method=lm, se=FALSE, linetype="dashed", color="darkred")
log_GMP_vs_ict
#log GMP vs management
log_GMP_vs_management = ggplot(msadata, aes(x=management, y=log(gmp))) +
geom_point(shape=18, color="blue") +
geom_smooth(method=lm, se=FALSE, linetype="dashed", color="darkred")
log_GMP_vs_management
```
#Fitting The Power Law Model
Using `lm()` to linearly regress log(GMP) and log(pcgmp) on the log of population size.
```{r}
lm_GMP_pop = lm(log(gmp)~log(pop), data=msadata)
summary(lm_GMP_pop)
lm_pcgmp_pop = lm(log(pcgmp)~log(pop), data=msadata)
summary(lm_pcgmp_pop)
```
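To read off the estimated scaling exponent, the coefficient on log(pop) and its confidence interval can be pulled directly from the fits above; a minimal sketch assuming `lm_GMP_pop` and `lm_pcgmp_pop` from the previous chunk. Under supra-linear scaling the exponent b in GMP = c * pop^b exceeds 1, which corresponds to a slope above 1 in the GMP fit and a slope above 0 (i.e., b - 1 > 0) in the per-capita fit.
```{r}
# Estimated exponent b and its 95% confidence interval from the log-log GMP fit
coef(lm_GMP_pop)["log(pop)"]
confint(lm_GMP_pop, "log(pop)", level = 0.95)
# Per-capita fit: the slope here estimates b - 1, so supra-linearity means slope > 0
coef(lm_pcgmp_pop)["log(pop)"]
confint(lm_pcgmp_pop, "log(pop)", level = 0.95)
```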
Generate residual plots to scrutinize the credibility of the normal, homoskedastic-errors version of the regression model.
```{r}
plot(lm_GMP_pop)
plot(lm_pcgmp_pop)
```
#Squared error loss on the log scale
```{r}
# In-sample mean squared error on the log(GMP) scale for each candidate model
fit_loggmp_logpop = lm(log(gmp) ~ log(pop), data = msadata)
loss1 = mean(resid(fit_loggmp_logpop)^2)
loss1
fit_loggmp_ict = lm(log(gmp) ~ ict, data = msadata)
loss2 = mean(resid(fit_loggmp_ict)^2)
loss2
fit_loggmp_finance = lm(log(gmp) ~ finance, data = msadata)
loss3 = mean(resid(fit_loggmp_finance)^2)
loss3
```
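For an at-a-glance comparison, the three in-sample losses can be gathered into a single named vector; a trivial sketch assuming `loss1`, `loss2`, and `loss3` from the chunk above (note that the ict and finance fits drop rows with missing values, so their losses are computed on slightly fewer MSAs).
```{r}
# Lower is better: in-sample squared error loss on the log(GMP) scale
sort(c(log_pop = loss1, ict = loss2, finance = loss3))
```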
#5-fold cross-validation
First we apply a generalized linear model to the dataset
and see how the cross-validated error estimate changes
with the degree of the polynomial.
```{r}
degree = 1:5
cv.error5 = rep(0, 5)
for (d in degree) {
  glm.fit = glm(pcgmp ~ poly(pop, d), data = msadata)
  cv.error5[d] = cv.glm(msadata, glm.fit, K = 5)$delta[1]
}
plot(degree, cv.error5, main = "CV Generalization Error, 5-fold",
     xlab = "degree", ylab = "cv.error5")
```
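Because `cv.glm()` assigns folds at random, the error estimates above change from run to run. A minimal sketch of making the procedure reproducible and averaging over repeated random splits (the seed value and number of repetitions are arbitrary choices, not part of the original analysis):
```{r}
set.seed(485)  # fix the random fold assignment so the chunk is reproducible
# Average the 5-fold CV error for the degree-1 model over several random splits
reps = 10
cv_deg1 = replicate(reps, {
  fit = glm(pcgmp ~ poly(pop, 1), data = msadata)
  cv.glm(msadata, fit, K = 5)$delta[1]
})
mean(cv_deg1)
```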
#Fitting and assessment of alternate models
Fit the alternative models and evaluate them using squared error loss (SEL) and 5-fold CV.
```{r}
# Alternate model 1: log per-capita GMP vs finance
pcgmp_finance = merge(pcgmp, finance, by = 0)          # merge the two vectors by row name
names(pcgmp_finance) = c("row", "pcgmp1", "finance1")  # label the merged columns
omit_pcgmp_finance = na.omit(pcgmp_finance)            # drop rows with NA
pcgmp1 = omit_pcgmp_finance$pcgmp1
finance1 = omit_pcgmp_finance$finance1
degree = 1:5
cv.error5a = rep(0, 5)
for (d in degree) {
  altM_pcgmp_finance = glm(log(pcgmp1) ~ poly(finance1, d), data = omit_pcgmp_finance)
  cv.error5a[d] = cv.glm(omit_pcgmp_finance, altM_pcgmp_finance, K = 5)$delta[1]
}
plot(degree, cv.error5a, main = "CV Gen. Error (log_pcgmp and finance), 5-fold",
     xlab = "degree", ylab = "cv.error5a")
# Alternate model 2: log per-capita GMP vs ict
pcgmp_ict = merge(pcgmp, ict, by = 0)
names(pcgmp_ict) = c("row", "pcgmp2", "ict1")
omit_pcgmp_ict = na.omit(pcgmp_ict)
pcgmp2 = omit_pcgmp_ict$pcgmp2
ict1 = omit_pcgmp_ict$ict1
cv.error5b = rep(0, 5)
for (d in degree) {
  altM_pcgmp_ict = glm(log(pcgmp2) ~ poly(ict1, d), data = omit_pcgmp_ict)
  cv.error5b[d] = cv.glm(omit_pcgmp_ict, altM_pcgmp_ict, K = 5)$delta[1]
}
plot(degree, cv.error5b, main = "CV Gen. Error (log_pcgmp and ict), 5-fold",
     xlab = "degree", ylab = "cv.error5b")
```
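Since the two blocks above repeat the same loop, a small helper keeps the logic in one place; a sketch assuming the cleaned data frames `omit_pcgmp_finance` and `omit_pcgmp_ict` built above (the function name `cv_poly_error` is mine, not part of the original analysis).
```{r}
# Hypothetical helper: 5-fold CV error of log(response) ~ poly(predictor, d) for d = 1..max_degree
cv_poly_error = function(df, response, predictor, max_degree = 5, K = 5) {
  errs = rep(0, max_degree)
  for (d in seq_len(max_degree)) {
    f = as.formula(paste0("log(", response, ") ~ poly(", predictor, ", ", d, ")"))
    fit = glm(f, data = df)
    errs[d] = cv.glm(df, fit, K = K)$delta[1]
  }
  errs
}
cv_poly_error(omit_pcgmp_finance, "pcgmp1", "finance1")
cv_poly_error(omit_pcgmp_ict, "pcgmp2", "ict1")
```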
#Squared error loss of the above two models:
```{r}
fit_pcgmp_finance = lm(log(pcgmp1) ~ finance1, data = omit_pcgmp_finance)
loss_pcg_finance = mean(resid(fit_pcgmp_finance)^2)
loss_pcg_finance
fit_pcgmp_ict = lm(log(pcgmp2) ~ ict1, data = omit_pcgmp_ict)
loss_pcg_ict = mean(resid(fit_pcgmp_ict)^2)
loss_pcg_ict
```
#Additional Hypothesis Tests on Holdout Data
```{r}
holdout = read.csv("http://dept.stat.lsa.umich.edu/~bbh/s485/data/gmp-2006-holdout.csv", header = TRUE)
#holdout objects
msaName_h = holdout$MSA #MSA name (metropolitan statistical areas)
pcgmp_h = holdout$pcgmp #per-capita GMP
pop_h = holdout$pop #population
finance_h = holdout$finance
prof.tech_h = holdout$prof.tech
ict_h = holdout$ict
management_h = holdout$management
gmp_h = pop_h * pcgmp_h #GMP = pop * per-capita GMP
anova1 = aov(log(gmp_h)~log(pop_h)) #power-law model on the holdout data
anova1$coefficients
summary(anova1)
anova2 = aov(log(gmp_h)~ict_h) #alternate model: ict
anova2$coefficients
summary(anova2)
anova3 = aov(log(gmp_h)~finance_h) #alternate model: finance
anova3$coefficients
summary(anova3)
```
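As a final check, the models fit on the original 2006 sample can also be scored directly on the holdout MSAs; a minimal sketch assuming `lm_GMP_pop`, `fit_loggmp_ict`, and `fit_loggmp_finance` from the earlier chunks, with the holdout loss computed on the log(GMP) scale (the `pred_` object names are mine).
```{r}
# Out-of-sample squared error loss on the log(GMP) scale, using the holdout MSAs
pred_pop     = predict(lm_GMP_pop,         newdata = data.frame(pop = pop_h))
pred_ict     = predict(fit_loggmp_ict,     newdata = data.frame(ict = ict_h))
pred_finance = predict(fit_loggmp_finance, newdata = data.frame(finance = finance_h))
mean((log(gmp_h) - pred_pop)^2,     na.rm = TRUE)
mean((log(gmp_h) - pred_ict)^2,     na.rm = TRUE)
mean((log(gmp_h) - pred_finance)^2, na.rm = TRUE)
```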