# timeseries_course.R
head(AirPassengers)
library(forecast)
library(ggplot2)
a = ts(AirPassengers, start=1949, end=1960, frequency = 12)
seasonplot(a, year.labels = TRUE, col=rainbow(12), labelgap = 0.35, cex=0.75)
?ggseasonplot
ggseasonplot(a)
b <- read.csv('irregular-sensor.csv', header = FALSE)
class(b$V1)
library(zoo)
library(tidyr)
?separate
irreg.split = separate(b, col = V1, into = c('date', 'time'), sep=8, remove = T)
sensor.date = strptime(irreg.split$date, '%m/%d/%y')
irregts.df = data.frame(date=as.Date(sensor.date), measurement = b$V2)
irreg.date = zoo(irregts.df$measurement, order.by = irregts.df$date)
ag.aggr = aggregate(irreg.date, as.Date, mean)
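#a quick plot to sanity-check the aggregated daily means (optional sketch)
plot(ag.aggr, xlab = 'date', ylab = 'mean measurement')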
############### Outliers and Missing Values ###############
#forecast::tsoutliers() flags outliers and suggests replacements, e.g. tsoutliers(AirPassengers)
#tso() from the tsoutliers package fits a full outlier-adjustment model
#missing data imputation
#zoo na.fill() na.locf() na.trim()
#forecast na.interp(), tsclean()
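#a minimal imputation sketch using the helpers above, on a copy of AirPassengers with two
#values blanked out for illustration (na.interp and tsclean are from forecast, na.locf from zoo)
ap_na <- AirPassengers
ap_na[c(10, 50)] <- NA
na.interp(ap_na)    #interpolates the missing points
tsclean(ap_na)      #imputes NAs and also replaces detected outliers
na.locf(zoo(ap_na)) #carries the last observation forward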
#Stationarity - no trend, mean stays constant, variance stays constant
#Heteroscedasticity - same mean, but changing variance - non-stationary
#Trend - mean is not constant, variance is - non-stationary
#Seasonal - variance and mean the same, spikes at regular intervals
#Exponential trend - mean not constant, variance not constant - non-stationary
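#a quick simulated illustration of these patterns (made-up series, not course data):
set.seed(1)
plot(ts(rnorm(120)), main = 'stationary: constant mean and variance')
plot(ts(cumsum(rnorm(120, mean = 0.3))), main = 'trend: mean is not constant')
plot(ts(rnorm(120, sd = seq(1, 6, length.out = 120))), main = 'heteroscedastic: variance grows')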
############### simple methods ###############
# naive() - last observation carried forward; snaive() - seasonal naive
# meanf() - mean forecast; rwf() - random walk (optionally with drift)
#AIC compares the complexity of different models and penalizes the more complex ones - the lower the better
#accuracy()
#ideally we want random residuals (all the pattern should be in the model): they should be uncorrelated,
#normally distributed, with mean 0 and constant variance
#a non-zero mean can be handled with addition/subtraction
#correlation - modelling tools, e.g. differencing
#non-normal distribution - transformations, e.g. logs
#Stationarity - does the data have the same statistical properties throughout the time series (variance, mean, autocorrelation)?
#unit root tests - fUnitRoots and urca packages
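#a minimal unit root test sketch with urca's ur.df (assuming the urca package is installed);
#the exercise below uses tseries::adf.test instead
library(urca)
summary(ur.df(as.numeric(AirPassengers), type = 'trend', lags = 12))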
############### Exercise ###############
set.seed(54)
myts <- ts(c(rnorm(50,34,10), rnorm(67,7,1), runif(23,3,14)))
#myts <- log(myts)
#myts <- diff(myts)
plot(myts)
acf(myts)
summary(myts)
acf(myts)
library(tseries)
adf.test(myts)
#seems like the dataset is non-stationary - the variance and mean change over time
#we need to make the dataset stationary
plot(diff(myts))
acf(diff(myts))
myts_n <- naive(myts, 10)
plot(myts_n)
myts_m <- meanf(myts, 10)
myts_r <- rwf(myts, 10, drift = TRUE)
lines(myts_m$mean, col = 'green')
lines(myts_r$mean, col = 'red')
legend('bottomleft', lty = 1, col = c('blue', 'green', 'red'), bty = 'n', cex = 0.75, legend = c('Naive', 'Mean', 'Drift'))
length(myts)
myts_train <- myts[1:112]
myts_test <- myts[113:140]
myts_naive <- naive(myts_train, 28)
myts_mean <- meanf(myts_train, 28)
myts_drift <- rwf(myts_train, drift = T, 28)
plot(myts)
lines(myts_naive$mean, col = 'red')
lines(myts_mean$mean, col = 'green')
lines(myts_drift$mean, col = 'blue')
accuracy(myts_naive, myts_test)
accuracy(myts_mean, myts_test)
accuracy(myts_drift, myts_test)
#checking residuals
plot(myts_n$residuals) #heteroscedasticity is present - the variance is not constant
#
mean(myts_n$residuals[2:140])
hist(myts_n$residuals)
#normality test:
shapiro.test(myts_n$residuals) #null hypothesis is that the residuals are normally distributed - p-value is below 0.05, so we reject it
acf(myts_n$residuals[2:140])
#a log transformation would greatly improve the residual behaviour
#nnetar (forecast) and the nnfor package provide neural network models for time series
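#a brief sketch with forecast::nnetar (the nnfor package has its own mlp()/elm() functions, not shown here)
nnfit <- nnetar(lynx)
plot(forecast(nnfit, h = 10))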
##################### Seasonal Decomposition ########################
#drawbacks - slow to catch changes and assumes constant seasonality
#can be either additive or multiplicative
#decomposing the AirPassengers dataset
plot(AirPassengers)
frequency(AirPassengers)
#we see a trend and seasonal pattern
#increasing amplitude of seasons- probably multiplicative decomposition?
air_ad <- decompose(AirPassengers, type='additive')
air_mu <- decompose(AirPassengers, type='multiplicative')
#in the multiplicative decomposition more of the variation seems to have gone into the random component
plot(air_ad)
plot(air_mu)
plot(air_ad$trend + air_ad$random)
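#equivalently, forecast::seasadj() returns the seasonally adjusted series (trend + remainder for an additive decomposition)
plot(seasadj(air_ad))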
##################### Simple Moving Average ########################
#define the number of observations and take their average
library('TTR')
x=c(1,2,3,4,5,6,7)
plot(x,type='l')
SMA(x,n=3)
plot(SMA(x,n=3))
#trying with lynx
lynxsmooth = SMA(lynx, n = 4)
plot(lynxsmooth)
plot(lynx)
lines(lynxsmooth, col = 'red')
##################### Exponential Smoothing #####################
#ses() for datasets without trend and seasonality
#holt() for trend without seasonality; the damped option handles a trend that flattens out over time
#hw() for trend + seasonality + damping
#ets() - automated model selection based on an information criterion
#models are based on smoothing coefficients: a coefficient close to 1 is reactive and puts a very high weight on the most recent observations,
#a coefficient closer to 0 is smoother, taking older data into account as well
# alpha - level; beta - trend; gamma - seasonality
#default 'ZZZ' - auto-selection - A additive, M multiplicative, N none
#example:
library(forecast)
etsmodel = ets(nottem)
etsmodel
#model - error - Additive, trend - None, Seasonality - Additive
#alpha and gamma seem close to 0, so this is more of a smooth, slowly reacting model
plot(nottem,lwd=2)
lines(etsmodel$fitted, col='red')
plot(forecast(etsmodel, h=12))
#manually choosing error and seasonality to be multiplicative, letting R decide on trend
etsmodmult = ets(nottem,model='MZM')
etsmodmult
plot(nottem,lwd=2)
lines(etsmodmult$fitted, col='red')
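#forecasting from the multiplicative model as well, mirroring the automatic model above
plot(forecast(etsmodmult, h = 12))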
##################### ARIMA #####################
#autoregressive integrated moving average
# (p,d,q)
# p - autoregressive part (AR)
# d - degree of differencing (Integrated)
# q - moving average part (MA)
#ARIMA requires a stationary time series! if we start with a non-stationary one, it will be differenced until it is
#stationary
#auto.arima() vs arima() - base arima is manual, whereas auto.arima selects the parameters automatically;
#with arima we mainly use the pacf and acf to choose some of the parameters
#Seasonal datasets are easier to model with exponential smoothing or seasonal decomposition?
#ARIMA(1,0,0) = first order lag of AR
# Yt = Constant + Coefficient * Yt-1 + Error_Term_t
#ARIMA(0,0,1) = first order MA
#ARIMA(1,0,1) - ARMA
#Yt = Constant + Coefficient * Yt-1 + Coefficient * ErrorTerm_T-1 + Error_Term_t
### auto.arima - popular and easy to use, but there is a danger of producing uninformed, low-quality models
#good as a benchmark
#example:
plot(lynx)
#no seasonality, but cyclical; this makes sense given the nature of the dataset: if more than a certain number
#of lynx are trapped, there will be fewer the next season
#checking for autoregression (p) with acf and pacf
tsdisplay(lynx)
#the dataset looks like it is stationary, the plot looks harmonic, so d might be 0
adf.test(lynx)
#the test confirms it
auto.arima(lynx)
#2,0,2 ARIMA
auto.arima(lynx, stepwise = FALSE, trace= TRUE, approximation = F)
#4,0,0
##################### Reproducing an ARIMA model manually #####################
#using lynx dataset
#model 2,0,0
myarima = arima(lynx, order= c(2,0,0))
myarima
#Yt = Constant + Coefficient_1 * Y_t-1 + Coefficient_2 * Y_t-2 + Error_Term_t
#Yt - mean = Coefficient_1(Y_t-1 - mean) + Coefficient_2(Yt-2 - mean) + Error_term_t
#how does R calculate the last values...
tail(lynx)
tail(residuals(myarima))
#manually calculating for the last value
(2657 - 1545.4458) * 1.1474 + (1590 - 1545.4458) * (-0.5997) + 601.8380
3396 - 1545.4458
#manually calculating for MA(2) model (0,0,2) ARIMA
myarima = arima(lynx, order=c(0,0,2))
#Yt = Constant + Coefficient_1 * Error_Term_t-1 + Coefficient_2 * Error_Term_t-2 + Error_Term_t
myarima
tail(residuals(myarima))
844.7122 * 1.1407 + 255.91108 * 0.4697 + 766.83050
3396 - 1545.37
##################### ARIMA simulation #####################
set.seed(123)
asim <- arima.sim(model = list(order = c(1,0,1) , ar= c(0.4), ma=c(0.3)), n = 1000) + 10
plot(asim)
library(zoo)
plot(rollmean(asim,50))
plot(rollmean(asim,25))
#stationarity
adf.test(asim)
#dataset is stationary
#autoregression
tsdisplay(asim)
#checking whether auto.arima will confirm the parameters we selected
auto.arima(asim, trace = T, stepwise = F, approximation = F)
#(1,0,1) is indeed chosen: ar1 = 0.31 (we specified 0.4), ma1 = 0.33 (close to the 0.3 we specified), and a mean of roughly 10
##################### Manual Model Selection #####################
#R base arima() does not calculate a constant if there is a differencing step
# Arima() from the forecast package can be used instead
#sample with lynx dataset
#testing for stationarity
adf.test(lynx)
#seems like the dataset is stationary, so we should not need any differencing
#d = 0
#testing for autoregression based on acf and pacf
tsdisplay(lynx)
#generally the acf tells us more about the MA part - q, whereas the pacf tells us more about the AR part - p
#acf is significant at almost every lag
#pacf is significant at lags 1, 2 and possibly 4
#we can check what happens with an AR(2) model
myarima <- Arima(lynx, order=c(2,0,0))
#checking the information criterion - AICc ~ 1878
#we would expect no autocorrelation in the residuals, and for them to be normally distributed
checkresiduals(myarima)
#the ACF still shows autocorrelation; generally, significant lags near the beginning are a clear sign that
#autocorrelation is still present, whereas significant lags towards the end could be down to chance
#however we can still try to adjust the model and see if we can improve
myarima <- Arima(lynx, order=c(3,0,0))
#AIC ~ 1880 - worse than the previous model
checkresiduals(myarima)
#problematic lags are still outside the threshold
myarima <- Arima(lynx, order=c(4,0,0))
#AIC ~ 1874 - the best one so far
checkresiduals(myarima)
#the autocorrelation seems to be eliminated
#the residuals are still not exactly normal, although close
#with p = 5 we get worse results than AR(4) (see the check below)
#therefore AR(4) seems to be the model we could choose
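#the AR(5) check referred to above, shown for completeness (a sketch; myarima5 is just a scratch variable name)
myarima5 <- Arima(lynx, order = c(5,0,0))
myarima5
checkresiduals(myarima5)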
### example of an MA model
set.seed(123)
myts <- arima.sim(model = list(order = c(0,0,2), ma=c(0.3, 0.7)), n=1000) + 10
#checking for stationarity
adf.test(myts)
#we do not need differencing
tsdisplay(myts)
#we can start with the 2 lags outside the significance bounds on the ACF -> MA part
myarima <- Arima(myts, order=c(0,0,2))
#aic = 2828
checkresiduals(myarima)
#normally distributed residuals
#the autocorrelation seems fine as well - only one lag outside the bounds, which is acceptable at the 95% confidence level
#with MA(3) the model shows no improvement (see the check below)
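#the MA(3) check referred to above, shown for completeness (a sketch; myarima3 is just a scratch variable name)
myarima3 <- Arima(myts, order = c(0,0,3))
myarima3
checkresiduals(myarima3)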
##################### Forecasting #####################
#r base predict() forecast package - forecast()
myarima <- auto.arima(lynx, stepwise = F, approximation = F)
#10-year forecast
arimafore <- forecast(myarima, h = 10)
plot(arimafore)
#forecasted results
arimafore$mean
#zooming on the forecast
plot(arimafore, xlim=c(1930, 1944))
#ets for comparison
myets <- ets(lynx)
etsfore <- forecast(myets, h=10)
#comparing the two models
library(ggplot2)
autoplot(lynx) + forecast::autolayer(etsfore$mean, series='ETS Model') +
forecast::autolayer(arimafore$mean, series='ARIMA Model') +
xlab('year') + ylab('Lynx Trappings') + guides(colour=guide_legend(title='Forecast Method')) +
theme(legend.position = c(0.8,0.8))
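#rough in-sample comparison of the two models (no hold-out set here, so these are training-set measures only)
accuracy(arimafore)
accuracy(etsfore)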
##################### Adding explanatory variables (predictors) to ARIMA #####################
setwd('C://Users/hgospodinov/Documents/Data Science Training/R Stuff/timeseries')
cyprinidae <- read.csv('cyprinidae.csv')
#do cyprinidae (small fish) respond with a hormonal reaction to the presence of a predatory catfish?
ggplot(cyprinidae, aes(y= concentration, x = X)) + geom_point() + aes(color=predator_presence)
#seems like there is a strong relationship between predator presence and hormone concentration
x = ts(cyprinidae$concentration)
y = cyprinidae$predator_presence
mymodel <- auto.arima(x, xreg= y, stepwise = F, approximation = F)
#xreg holds the explanatory variable(s)
mymodel
#the xreg effect kicks in whenever predator_presence is TRUE
checkresiduals(mymodel)
#forecasting
y1 = c(T,T,F,F,F,F,T,F,T,F)
plot(forecast(mymodel, xreg=y1))
#zooming in
plot(forecast(mymodel, xreg=y1),xlim=c(230,260))