-
Notifications
You must be signed in to change notification settings - Fork 14
/
ch4.R
206 lines (163 loc) · 9.59 KB
/
ch4.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
### Week 3 Chapter 4 Statistical Learning
# Console output pasted from the session is kept as `#>` comments so the script still parses.
library(ISLR) # Load ISLR, which provides the Smarket stock-market data; unlike require(), library() stops with an error if the package is missing
names(Smarket) # Gives the names of the variables; Direction is the response (outcome) variable
#> [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5" "Volume"
#> [8] "Today" "Direction"
summary(Smarket) # Per-variable summary: quartiles and mean for the numeric columns, counts for the Direction factor
#> Year Lag1 Lag2 Lag3
#> Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
#> 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
#> Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
#> Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
#> 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
#> Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
#> Lag4 Lag5 Volume Today Direction
#> Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000 Down:602
#> 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500 Up :648
#> Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
#> Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
#> 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
#> Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
# Lag1-Lag5 hold the percentage returns for each of the five previous trading days (returns, not prices).
# Console output pasted from the session is kept as `#>` comments so the script still parses.
?Smarket # Help page: daily percentage returns for the S&P 500 stock index between 2001 and 2005
pairs(Smarket,col=Smarket$Direction) # Scatterplot matrix of all variables, coloured by the binary Direction class
# Binary logistic regression model, using Direction as the outcome variable
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial)
summary(glm.fit) # Gives summary output
#> Call:
#> glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
#> Volume, family = binomial, data = Smarket)
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.446 -1.203 1.065 1.145 1.326
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.126000 0.240736 -0.523 0.601
#> Lag1 -0.073074 0.050167 -1.457 0.145
#> Lag2 -0.042301 0.050086 -0.845 0.398
#> Lag3 0.011085 0.049939 0.222 0.824
#> Lag4 0.009359 0.049974 0.187 0.851
#> Lag5 0.010313 0.049511 0.208 0.835
#> Volume 0.135441 0.158360 0.855 0.392
#> (Dispersion parameter for binomial family taken to be 1)
#> Null deviance: 1731.2 on 1249 degrees of freedom
#> Residual deviance: 1727.6 on 1243 degrees of freedom
#> AIC: 1741.6
#> Number of Fisher Scoring iterations: 3
# Notes on the output:
# - Nothing is significant in the model (every p-value is well above 0.05).
# - Null deviance is the deviance of the intercept-only model, which predicts the overall mean for every observation.
# - Residual deviance is the deviance that remains after fitting the six predictors.
# Console output pasted from the session is kept as `#>` comments so the script still parses.
glm.probs=predict(glm.fit,type="response") # type = "response" returns predicted probabilities; with no newdata these are for the data the model was fitted on
glm.probs[1:5] # Show the first five predicted probabilities
#> 1 2 3 4 5
#> 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812
glm.pred=ifelse(glm.probs>0.5,"Up","Down") # Threshold at 0.5: probabilities above 0.5 are labelled "Up", all others "Down"
attach(Smarket) # Puts the Smarket columns on the search path; the bare Direction and Year references further down depend on this
table(glm.pred,Direction) # Confusion matrix: glm.pred is the predicted class (rows), Direction is the true class (columns)
#>         Direction
#> glm.pred Down  Up
#>     Down  145 141
#>     Up    457 507
# The diagonal values are the correct classifications; the off-diagonal values are the incorrect ones.
mean(glm.pred==Direction) # Proportion of observations classified correctly
#> [1] 0.5216
# Performed slightly better than chance (0.50).
# Make training and test set
# Console output pasted from the session is kept as `#>` comments so the script still parses.
train = Smarket$Year<2005 # Logical vector: TRUE for training rows (years before 2005), FALSE for the 2005 test rows
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial, subset=train) # subset = train fits the model only on rows where train is TRUE
# Use the remaining !train rows as the test data set
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") # type = "response" gives predicted probabilities
glm.pred=ifelse(glm.probs >0.5,"Up","Down") # Make an up/down variable: "Up" when the probability exceeds 0.5, otherwise "Down"
Direction.2005=Smarket$Direction[!train] # True classes for the test rows
table(glm.pred,Direction.2005) # Confusion matrix on the test set: predicted class (rows) against true class (columns)
#>         Direction.2005
#> glm.pred Down Up
#>     Down   77 97
#>     Up     34 44
# Diagonal values are where we got it right; off-diagonal values are the mistakes.
mean(glm.pred==Direction.2005) # Classification rate on the test set
#> [1] 0.4801587
# The test accuracy is below 50% (the null rate), which could indicate that the six-predictor model is overfitting.
# Fit a smaller model (only two predictors, Lag1 and Lag2)
# Console output pasted from the session is kept as `#>` comments so the script still parses.
glm.fit=glm(Direction~Lag1+Lag2,
            data=Smarket,family=binomial, subset=train)
summary(glm.fit)
#> Call:
#> glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Smarket,
#> subset = train)
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.345 -1.188 1.074 1.164 1.326
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 0.03222 0.06338 0.508 0.611
#> Lag1 -0.05562 0.05171 -1.076 0.282
#> Lag2 -0.04449 0.05166 -0.861 0.389
#> (Dispersion parameter for binomial family taken to be 1)
#> Null deviance: 1383.3 on 997 degrees of freedom
#> Residual deviance: 1381.4 on 995 degrees of freedom
#> AIC: 1387.4
#> Number of Fisher Scoring iterations: 3
# Null deviance is the deviance of the intercept-only (mean) model; residual deviance is what remains after adding Lag1 and Lag2.
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") # Predicted probabilities for the test rows
glm.pred=ifelse(glm.probs >0.5,"Up","Down") # Classify with a 0.5 threshold
table(glm.pred,Direction.2005) # Confusion matrix: predicted class (rows) against true class (columns)
#>         Direction.2005
#> glm.pred Down  Up
#>     Down   35  35
#>     Up     76 106
# Correct classifications are on the diagonal; incorrect classifications are on the off-diagonal.
mean(glm.pred==Direction.2005) # This is the classification rate of the model
#> [1] 0.5595238
# A better test classification rate than the previous six-predictor model (0.48).
106/(76+106) # Of the days predicted "Up" (bottom row of the table), the share that really went up
#> [1] 0.5824176
# When the model predicts "Up", it is correct about 58% of the time.
# Console output pasted from the session is kept as `#>` comments so the script still parses.
# library() is used instead of require() so that a missing package stops the script immediately.
library(ISLR) # This has the dataset on the stock market
library(MASS) # This has the Linear Discriminant Analysis commands that we need
## Linear Discriminant Analysis
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket, subset=Year<2005) # Fit on the observations before 2005
lda.fit
#> Call:
#> lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = Year <
#> 2005)
#> Prior probabilities of groups:
#> Down Up
#> 0.491984 0.508016
#> Group means:
#> Lag1 Lag2
#> Down 0.04279022 0.03389409
#> Up -0.03954635 -0.03132544
#> Coefficients of linear discriminants:
#> LD1
#> Lag1 -0.6420190
#> Lag2 -0.5135293
# Notes on the output:
# - Prior probabilities are the proportions of Down and Up in the data used for fitting.
# - Group means are the mean of each predictor within each class.
# - The coefficients define the linear discriminant LD1 as a combination of Lag1 and Lag2.
plot(lda.fit)
# Create a test dataset
# Console output pasted from the session is kept as `#>` comments so the script still parses.
Smarket.2005=subset(Smarket,Year==2005) # From the Smarket data frame, keep only the observations where Year is 2005
lda.pred=predict(lda.fit,Smarket.2005)
# Deliberate failure: lda.pred is a list, so it cannot be indexed with [rows, columns].
# try() reports the error without stopping the script when it is run with source().
try(lda.pred[1:5,])
#> Error in lda.pred[1:5, ] : incorrect number of dimensions
class(lda.pred) # What form is the prediction object in? A list.
#> [1] "list"
data.frame(lda.pred)[1:5,] # Cast to a data frame, then look at the first 5 observations
#> class posterior.Down posterior.Up LD1
#> 999 Up 0.4901792 0.5098208 0.08293096
#> 1000 Up 0.4792185 0.5207815 0.59114102
#> 1001 Up 0.4668185 0.5331815 1.16723063
#> 1002 Up 0.4740011 0.5259989 0.83335022
#> 1003 Up 0.4927877 0.5072123 -0.03792892
table(lda.pred$class,Smarket.2005$Direction) # Confusion matrix: predicted classes (rows) against the true classes (columns)
#>      Down  Up
#> Down   35  35
#> Up     76 106
# Diagonal elements are the correct classifications; off-diagonal elements are the mistakes.
mean(lda.pred$class==Smarket.2005$Direction) # Classification rate of the model, which is 56%
#> [1] 0.5595238
# The model performs about 6 percentage points better than chance (flipping a coin).
## K-Nearest Neighbors (very handy to have in toolbox, effective most of the time)
# Console output pasted from the session is kept as `#>` comments so the script still parses.
library(class)
?knn
# From the help page: k-nearest neighbour classification for test set from training set.
# For each row of the test set, the k nearest (in Euclidean distance) training set vectors
# are found, and the classification is decided by majority vote, with ties broken at random.
# If there are ties for the kth nearest vector, all candidates are included in the vote.
# Columns are taken from Smarket explicitly, so this section does not need attach(Smarket).
Xlag=with(Smarket,cbind(Lag1,Lag2)) # Make a matrix with Lag1 and Lag2
Xlag[1:5, ] # Take a look at the first five observations
train=Smarket$Year<2005 # Designate train as Year less than 2005
set.seed(1) # knn() breaks ties at random (see the help text above), so fix the seed to make the run reproducible
# Make a k-nearest-neighbour model: knn(x-variables from train, x-variables from test, class labels from train, k = ...)
knn.pred=knn(Xlag[train,],Xlag[!train,],Smarket$Direction[train],k=1)
table(knn.pred,Smarket$Direction[!train]) # Confusion matrix of predicted (rows) against true (columns) values
#> knn.pred Down Up
#>     Down   43 58
#>     Up     68 83
mean(knn.pred==Smarket$Direction[!train]) # Proportion of test observations predicted correctly
#> [1] 0.5
# The prediction rate is 50%, which is the same as flipping a coin.