-
Notifications
You must be signed in to change notification settings - Fork 1
/
mvp_ar_pois.R
192 lines (147 loc) · 6.13 KB
/
mvp_ar_pois.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# Need some functions to fit a Poisson-family GLM.
# Some of the predictor variables will be lagged versions of the
# ar_pois
# level 0
# some sort of wrapper function, not sure of exact linkage to functions below
ar_pois <- function(dat, holdType=c("end"), nHold=672, nAhead=24*7*(60/nMin), nMin=15, date_col="datetime_rnd", resp_col="counts", time_preds=c("minute", "hour", "dow", "month"), lags=c(1,2,3,4, (24*(60/nMin))+c(-1,0,1), (7*24*(60/nMin))+c(-1,0,1))){
time_preds <- match.arg(time_preds, several.ok=TRUE)
train_ind <- ar_pois_hold(dat, holdType, nHold, returnType="ind")
dat_train <- dat[train_ind]
dat_test <- dat[!train_ind]
dt <- dat_train[,get(date_col)]
y <- dat_train[,get(resp_col)]
train_mod <- ar_pois_fit(y, xreg=time_factors(dt, types=time_preds, lag_max=max(lags)), lags=lags)
dat_forecast <- ar_pois_forecast(mod=train_mod, nAhead=nAhead, nMin=nMin, time_preds=time_preds)
names_df <- names(dat_forecast)
if("minute"%in%names_df){dat_forecast[,minute:=as.integer(as.character(minute))]}
if("hour"%in%names_df){dat_forecast[,hour:=as.integer(as.character(hour))]}
if("dow"%in%names_df){dat_forecast[,dow:=as.integer(as.character(dow))]}
if("month"%in%names_df){dat_forecast[,month:=as.integer(as.character(month))]}
col_by <- intersect(colnames(dat_test), colnames(dat_forecast))
dat_out <- merge(dat_test, dat_forecast, by=col_by, all=TRUE)
# dat_out <- merge(dat_test, dat_forecast, by=c('datetime_rnd', 'minute', 'hour', 'dow', 'month'), all=TRUE)
# dat_out <- merge(dat_test, dat_forecast, by=c('datetime_rnd'), all=TRUE)
dat_out <- dat_out[,list(datetime_rnd, counts, counts_hat)]
return(dat_out)
}
# test_dat <- dat2[SubDivision=="C8" & datetime_rnd<="2017-06-27"]
# test <- ar_pois(test_dat)
# test[,plot(datetime_rnd, counts, col='black', type='l', ylim=range(c(counts, counts_hat)))]
# test[,lines(datetime_rnd, counts_hat, col='red')]
# dev.new(); test[,plot(counts, counts_hat)]
# test[,cor(counts, counts_hat)]
# ar_pois_hold
# level 1a
# use date range to hold out some elements
# could also use date pattern (every n-th week)
# no, don't do pattern, because pain in the ass with lagged predictors!
# if you want more hold out/ test data than can be achieved by the forecast horizon, need to do rolling or extending windows ==> ref old convo with donald
ar_pois_hold <- function(dat, holdType=c("end"), nHold=672, returnType=c("list","ind")){
holdType <- match.arg(holdType)
if(holdType=="end"){
if(returnType=="ind"){
train_ind <- rep(TRUE, nrow(dat))
lt <- length(train_ind)
train_ind[lt:(lt-nHold+1)] <- FALSE
return(train_ind)
}else{
dat_train <- head(dat, -nHold)
dat_test <- tail(dat, nHold)
return(list(train=dat_train, test=dat_test))
}
}
}
# ar_pois_fit
# level 1b
# call GLM to fit model to training data
# as input, needs data already sub
# as output, will give model object
ar_pois_fit <- function(y, xreg=NULL, lags=c(1,2,3,4, (24*(60/nMin))+c(-1,0,1), (7*24*(60/nMin))+c(-1,0,1)), nMin=15){
X0 <- lag_design(y=y, lags=lags)
if(is.null(xreg)){
X <- as.data.frame(X0)
}else{
stopifnot(any(class(xreg)%in%c("data.table","data.frame", "matrix")))
X <- as.data.frame(cbind(X0, xreg))
}
mod <- glm(y~., data=X, family=poisson)
return(mod)
}
# tsVec <- egDat[,counts]
# dt <- egDat[,datetime_rnd]
# test_mod <- ar_pois_fit(tsVec, xreg=time_factors(dt))
# summary(test_mod) # AIC = 28955
# cor(fitted(test_mod), tail(tsVec, -673)) # 0.4608277
# ar_pois_forecast
# level 1c
# as input, will accept a fitted model and desired forecast horizon
# will call update function
ar_pois_forecast <- function(mod, nAhead=24*7*(60/nMin), nMin=15, time_preds=c("minute", "hour", "dow", "month")){
time_preds <- match.arg(time_preds, several.ok=TRUE)
dat <- mod$data
dt <- dat[,"datetime"]
y <- dat[,"y"]
# the time indices predictors can be made all at once for the
# entire forecast horizon, because it only depends on the time index/ datetime
dt_ahead <- seq(from=tail(dt, 1), by=nMin*60, length.out=nAhead+1)#[-1]
tf_ahead <- time_factors(dt_ahead, lag_max=1, types=time_preds)
# the predictors that are lagged versions of the response can only be produced for 1 time step ahead
def_lags <- c(1,2,3,4, (24*(60/nMin))+c(-1,0,1), (7*24*(60/nMin))+c(-1,0,1))
# y <- c(1:max(def_lags), NA)
# loop through
# update state vector at each time step with new prediction
pred_vec <- rep(NA, nAhead)
y_vec <- y
for(i in 1:nAhead){
y_vec <- tail(y_vec, max(def_lags))
y_vec <- c(y_vec,NA)
X_lag <- lag_design(y=y_vec, lags=def_lags, update=TRUE)
X <- cbind(X_lag, tf_ahead[i])
X[,y:=NULL]
y_pred_t1 <- predict(mod, newdata=X, type='response')
pred_vec[i] <- y_pred_t1
y_vec[length(y_vec)] <- y_pred_t1
}
dt_out <- cbind(tf_ahead, counts_hat=pred_vec)
# dt_out[,plot(datetime, count_hat, type='l')]
setnames(dt_out, "datetime", "datetime_rnd")
return(dt_out)
}
# ====================
# = Helper Functions =
# ====================
lag_design <- function(y, lags, nMin=15, update=FALSE){
lag_max <- max(lags)
lags_names <- paste0("lag",lags)
tsVec0 <- y
tsVec <- tail(y, -lag_max)
if(update){
tsVec_lag <- matrix(embed(tsVec0, lag_max+1)[,-1, drop=FALSE], nrow=1) #[,-(lag_max+1)]
dimnames(tsVec_lag) <- list(NULL, paste0('lag',1:lag_max))
}else{
# tsVec_lag <- embed(tsVec0, lag_max+1)[,-(lag_max+1)]
# dimnames(tsVec_lag) <- list(NULL, paste0('lag',lag_max:1))
tsVec_lag <- embed(tsVec0, lag_max+1)[,-1]
dimnames(tsVec_lag) <- list(NULL, paste0('lag',1:lag_max))
}
mod_X0 <- cbind(y=tsVec, tsVec_lag[,lags_names, drop=FALSE])
mod_X <- mod_X0[, c("y",lags_names), drop=FALSE]
return(mod_X)
}
time_factors <- function(dt, types=c("minute", "hour", "dow", "month"), lag_max=24*7*(60/15)+1){
types <- match.arg(types, several.ok=TRUE)
d <- data.table(datetime=dt)
if("minute"%in%types){
d[,minute:=as.factor(minute(datetime))]
}
if("hour"%in%types){
d[,hour:=as.factor(hour(datetime))]
}
if("dow"%in%types){
d[,dow:=as.factor(as.integer(format.Date(datetime, format="%u"))-1)]
}
if("month"%in%types){
d[,month:=as.factor(month(datetime))]
}
return(tail(d, -lag_max)) # added [] so it would print ... this still confuses me, seems to be working though
}