-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tranport_forecasting_project.R
599 lines (403 loc) · 18.7 KB
/
Tranport_forecasting_project.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
library(ggplot2)
library(dplyr)
library(fpp3)
library(tidyverse)
library(seasonal)
# Filtered data for Trucks across US-Canada Border
df=read.csv("Border_Crossing_Entry_Data.csv")
print(str(df))
df_clean <- df |> distinct()
# Remove duplicates based on specific columns
df_clean <- df |>
distinct(Port.Name, State, Port.Code, Border, Date, Measure, Value, Latitude, Longitude, Point, .keep_all = TRUE)
sum(duplicated(df_clean))
US_Canada <- df_clean |>
filter(df_clean$Border=="US-Canada Border" & df_clean$Measure=="Trucks")
df1<-US_Canada |>
select('State','Measure','Value')
US_Canada
#Aggregated data for conversion to total number of truck for each state irrespective of the port of entry
agg_data=aggregate(df1$Value, by=list(df1$State,df1$Measure), FUN=sum)
agg_data$State=agg_data$`Group.1`
agg_data$Measure=agg_data$`Group.2`
agg_data$Value=agg_data$x
agg_data<- agg_data|>
select('State','Measure','Value')
agg_data
options(scipen = 999)
library(scales)
#Bar plot to visualize the aggregated data for Trucks per state
ggplot(data=agg_data, aes(x=State, y=Value)) + geom_col() +
labs(title='Frequency of Trucks from different States',
x='State',
y='Number of Trucks') +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1)) +
scale_y_continuous(labels = scales::comma)
#As evident from the aggregation table and the histogram, Michigan seems to have highest movement of trucks across the border and hence we would wish to study the trend and accordingly forecast the traffic for next 6 months.
#Filtering Michigan from the Truck dataframe we extracted at the beginning for US-Canada border
US_Canada_MI <- US_Canada |>
filter(State=="Michigan")
US_Canada_MI
#Box plot view of Trucks values at all ports of entry located in Michigan
boxplot(Value~Port.Name, data=US_Canada_MI,main="Box plot of the trucks across border ports located in Michigan") +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9"))
#Aggregating data to ignore Port of entries and combine all truck values in Michigan as per monthly frequency
agg_data=aggregate(US_Canada_MI$Value, by=list(US_Canada_MI$Port.Name,US_Canada_MI$Measure), FUN=sum)
print(agg_data)
MI_TimeSeries <- aggregate(US_Canada_MI$Value, by=list(US_Canada_MI$Date,US_Canada_MI$Measure), FUN=sum)
print(MI_TimeSeries)
#We get total of 341 observations representing 341 months of data of number of trucks coming from Canada and entering US through Michigan border. This becomes are base dataset that we will convert to tsibble and plot the time series using Date Index.
MI_TimeSeries<- MI_TimeSeries |>
rename(Date=Group.1, Measure=Group.2, Value=x)
str(MI_TimeSeries)
#Converting date index into year-month format using mutate for easier index reference at monthly frequency
MI_TimeSeries<- MI_TimeSeries %>%
mutate(Date = yearmonth(Date))
str(MI_TimeSeries)
#Converted data set to time sibble object and used autoplot for plotting timeseries data
MI_TimeSeries_Plot <- as_tsibble(MI_TimeSeries, index="Date")
options(repr.plot.width = 15, repr.plot.height = 8)
MI_TimeSeries_Plot |>
autoplot(Value) +
labs(title="Monthly data of trucks crossing US-CAN border from Michigan") +
scale_x_yearmonth(date_breaks="1 year", date_labels="%Y") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
#OBSERVATIONS FROM THE INITIAL TIME SERIES PLOT
#1. Data has rising trends within periods 1996 Jan till Dec 2008 and from Dec 2009 till Dec 2019.
#2. There is an evident seasonality after every 6 months as we can see a dip after every 6 months in trucks data across all the years
#3. There are two significant and sharp dips at year 2009 and 2020 representing Economic Depression and COVID-19 events that disrupted movement of transportation across US and Canada due to economic and legal restrictions.
#4. There is high variability as in trucks data from 1996 -2009 as compared to period 2010-2024. This is evident from the height of spikes that is significantly higher in 1996-2009. Hence this shows unstable variance in data that we would need to correct in order to get a better visualization of trends and seasonality.
#5. There is no stationarity in data as there is trend with seasonality
MI_TimeSeries_Qtr<- MI_TimeSeries %>%
mutate(Date = yearquarter(Date))
MI_TimeSeries_Qtr <- MI_TimeSeries_Qtr[order(MI_TimeSeries_Qtr$Date),]
MI_TimeSeries_Qtr <- MI_TimeSeries_Qtr %>% group_by(Date) %>%
summarise(Total_Value=sum(Value),
.groups = 'drop')
str(MI_TimeSeries_Qtr)
#Converted quarterly data set to time sibble object and used autoplot for plotting timeseries data
MI_TimeSeries_Qtr <- as_tsibble(MI_TimeSeries_Qtr, index="Date")
options(repr.plot.width = 15, repr.plot.height = 8)
ggplot(data=MI_TimeSeries_Qtr, aes(x=Date, y=Total_Value)) + geom_line() +
labs(title='Quarterly Frequency of Trucks from different States ') +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1)) +
scale_x_yearquarter(date_breaks = "2 years",
date_minor_breaks = "1 year") +
scale_y_continuous(labels = scales::comma)
#ACF PLOTS AND DECOMPOSITION TECHNIQUES
ACF_plot<- MI_TimeSeries_Plot |>
select('Date', 'Measure','Value')
ACF_plot <- ACF_plot[order(ACF_plot$Date),]
ACF_plot
options(repr.plot.width = 12, repr.plot.height = 8)
ACF_plot |>
ACF(ACF_plot$Value, lag_max=48) |>
autoplot() +
labs(title="ACF plot of monthly trucks data", scale=c(1,10)) +
theme_gray(base_size = 18)
#The ACF plot above shows clear positive autocorrelation which shows that the future value of number of trucks crossing US-CAN border is positively correlated with past values.
#Code to display ACF and PACF plot against the original time series
ACF_plot |> gg_tsdisplay(Value, plot_type = "partial") +
labs(title="GG Residual Plots for the Time Series", scale=c(1,10))
#Data in the ACF plot without difference shows some signs of trends with less stationarity.
#Another insight we can see is that trucks number have dropped fairly after 2010 with highest figure of 250,000 around year 2005 while highest figure of 220,000 around year 2008
#Plotting ACT plot for first difference of Measure variable
ACF_plot$diff <- difference(ACF_plot$Value)
ACF_plot |>
ACF(ACF_plot$diff, lag_max=48) |>
autoplot() +
labs(title="ACF plot of first difference of trucks data", scale=c(1,10)) +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
ACF_plot |> gg_tsdisplay(diff, plot_type = "partial") +
labs(title="GG display view of ACF and PACF Plots", width=800, height=500)
#After first difference the data looks quite stationary and it does not seem that we would need 2nd difference. But to avoid any assumptions we will try to take second difference and see if there is any change to the stationarity of data.
install.packages("urca")
library(urca)
# KPSS test for first difference
ACF_plot |> features(diff, unitroot_kpss)
ACF_plot |>
ACF(difference(ACF_plot$diff), lag_max=48) |>
autoplot() +
labs(title="ACF plot of second difference of trucks data", scale=c(1,10)) +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
ACF_plot |> gg_tsdisplay(difference(diff), plot_type = "partial") +
labs(title="ACF and PACF plots for secon differenced data")
#KPSS test for the second difference
ACF_plot |> features(difference(diff), unitroot_kpss)
#From the visual review and hypothesis test of stationarity, it is fairly to make below assumptions :
#1. ACF plot of original data shows positive autocorrelation which indicates that present values of trucks crossing border have a correlation with past values.
#2. There seems to be a slight trend in the line plot of original data and some seasonal variations as well. We will use decomposition plots for check for trend and seasonality in order to check what time series models will be appropriate to fit the data.
#3. For ARIMA models, since stationarity is one of the requirements, we see that ggplot of first difference visually and through kpss test shows good amount of stationarity. On the other hand the second difference is very much similar to first one , hence we conclude that first different would be sufficient to build ARIMA and SARIMA models.
MI_TimeSeries_Plot |>
autoplot(log(Value)) +
labs(title="Log transformation of Monthly data of trucks") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
MI_TimeSeries_Plot |>
autoplot(Value) +
labs(title="Untransformed data of trucks crossing US-CAN border from Michigan") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
#Box Cox Transformation parameter tuning
lambda<- MI_TimeSeries_Plot |>
features(Value,features = guerrero) |>
pull(lambda_guerrero)
#Data plot after box cox transformation
MI_TimeSeries_Plot |>
autoplot(box_cox(Value,lambda)) +
labs(title="Box Cox transformer for Measure of Trucks") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
print("optimal Lambda for boxcox",lambda)
#From Transformations we can see that Box Cox transformations hardly had any effect variance in the values across the months. Although log trasnformations had shown some significant improvement in capturing the data variance over time and stablized it to some extent. However, the data is still non-stationary.
MI_TimeSeries_Plot$Log_adjust <- log(MI_TimeSeries_Plot$Value)
MI_TimeSeries_Plot
# STL and classical decomposition of the log adjusted values
MI_dcmp1 <- MI_TimeSeries_Plot|>
model(
classical = classical_decomposition(Value, type="additive"),
stl1 = STL(Value)
)
# Plotting STL of the log adjusted values
stlcomp1<- MI_dcmp1|> select(stl1) |> components(MI_dcmp1)
stlcomp1 |> autoplot()+
labs(title = "STL decomposition of Value data of Trucks ") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
# Plotting Classical decomposition of the log adjusted values
clscomp1<- MI_dcmp1|> select(classical) |> components(MI_dcmp1)
clscomp1 |> autoplot()+
labs(title = "Classical decomposition of Value data of Trucks ") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
MI_TimeSeries_Train <- MI_TimeSeries_Plot |>
filter(year(Date) < 2024)
ETS_fit <- MI_TimeSeries_Train |>
model(
ETS(Value ~ error("A")+trend("Ad")+season("A"))
)
tidy(ETS_fit) |> select(term,estimate)
fc <- ETS_fit |>
forecast(h=12)
fc |>
autoplot(MI_TimeSeries_Plot) +
labs(title= "ETS Model Forecast plot for 12 months ") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
ETS_fit |> report()
result_table <- fc %>%
as_tibble %>%
select(Date, .mean)
result_table
MI_TimeSeries_Test <- MI_TimeSeries_Plot |>
filter(year(Date) == 2024)
MI_TimeSeries_Test
install.packages("PerMat")
library(PerMat)
actual <- MI_TimeSeries_Test$Value
predicted<- result_table$.mean
RMSE(actual, predicted)
MAPE(actual, predicted)
MAE(actual, predicted)
ggplot(result_table, aes(x = Date, y = .mean)) + geom_line()
#XTL DECOMPOSITION
xtl_dcmp<- MI_TimeSeries_Train |>
model(
xtl= X_13ARIMA_SEATS(Value ~ x11()), #multiple options but we use x11
)
xtl_dcmp<- xtl_dcmp|> select(xtl) |> components(xtl_dcmp)
xtl_dcmp
library(knitr)
xtl_dcmp |> tail() |> kable()
xtl_dcmp |> autoplot()
#BASIC MOVING AVERAGE FORECAST USING NAIVE, DRIFT AND SEASONAL NAIVE
#Moving Averages Forecast
MI_Trucks_fit<- MI_TimeSeries_Plot |>
model(
MI_Mean= MEAN(Value),
MI_Naive=NAIVE(Value),
MI_Drift= NAIVE(Value~drift())
)
MI_Trucks_fit |> forecast(h=30) |> autoplot(MI_TimeSeries_Plot, level=NULL) +
labs(title="Forecast of basic Moving Averages methods ") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
naive_fit <- MI_Trucks_fit |> select(MI_Naive)
naive_fit |> gg_tsresiduals()
drift_fit <- MI_Trucks_fit |> select(MI_Drift)
drift_fit |> gg_tsresiduals()
MI_seasonal<-MI_TimeSeries_Plot |> model(SNAIVE(Value))
MI_seasonal |> forecast(h=20) |> autoplot(MI_TimeSeries_Plot) +
labs(title="Seasonal Naive Forecast for next 20 months") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
#Linear Regression Using XTL
xtl_dcmp
Dcmp_TS_MI <- xtl_dcmp |>
select(Date, Value, trend, seasonal, irregular, season_adjust)
Dcmp_TS_MI<- Dcmp_TS_MI |>
as_tsibble(index=Date)
#MULTIPE REGRESSION TIME SERIES MODELING USING TREND AND SEASONALITY
#Multiple Linear Regression of Value with trend and seasonal adjusted values
Linear_XTL_train <- Dcmp_TS_MI |>
filter(year(Date) < 2023)
Linear_XTL_test <- Dcmp_TS_MI |>
filter(year(Date) >= 2023)
Linear_XTL_Multiple <- Linear_XTL_train |>
model(TSLM(Value ~ trend + seasonal))
Linear_XTL_Multiple |> report()
Linear_XTL_Multiple |> gg_tsresiduals()
Linear_XTL_Multiple |> report()
fc <- forecast(Linear_XTL_Multiple, new_data = Linear_XTL_test)
Dcmp_TS_MI |>
autoplot(Value) +
labs(title="Multiple Linear Regression Forecast for 12 months from Jan-Dec 2023") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1)) +
autolayer(fc)
fc
Multiple_Linear_Predict <- fc$.mean
Multiple_Linear_Actual<-Linear_XTL_test$Value
as.data.frame(Multiple_Linear_Predict)
#MODEL EVALUATION METRICS FOR MULTIPLE LINEAR REGRESSION
RMSE(Multiple_Linear_Actual,Multiple_Linear_Predict)
MAPE(Multiple_Linear_Actual,Multiple_Linear_Predict)
MAE(Multiple_Linear_Actual,Multiple_Linear_Predict)
#ARIMA MODEL
#We Checked stationarity of the timeseries data earlier in this Code file and we will now use it for Modeling.
#So calling the difference codes again to show ACF and PACF plots of first difference since we saw that first difference was sufficient to get sufficient stationarity in the data.
#Plotting ACT plot for first difference of Measure variable
ACF_plot$diff <- difference(ACF_plot$Value)
ACF_plot |>
ACF(ACF_plot$diff, lag_max=48) |>
autoplot() +
labs(title="ACF plot of first difference of trucks data", scale=c(1,10)) +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
ACF_plot |> gg_tsdisplay(diff, plot_type = "partial")
#For ARIMA we need to choose below values:
#AR model inputs:
#p= 1 or 2
#d= 1 , since we took first difference and we can also try with 0
#q= 1 or 2
#P= 0
#D=1
#Q=1 or 2
#MODEL FITTING FOR ARIMA MODELS
ARIMA_fit1 <- MI_TimeSeries_Train |>
model(
AR1=ARIMA(Value),
AR2=ARIMA(Value~pdq(1,1,1)),
AR3=ARIMA(Value~pdq(2,1,2)),
AR4=ARIMA(Value~pdq(0,1,1)),
AR5=ARIMA(Value~pdq(0,1,2))
)
ARIMA_fit1 |> select(AR1) |>
report()
ARIMA_fit1 |> select(AR2) |>
report()
ARIMA_fit1 |> select(AR3) |>
report()
ARIMA_fit1 |> select(AR4) |>
report()
ARIMA_fit1 |> select(AR5) |>
report()
#MODEL FORECATING METRICS USING ARIMA MODEL WITH LEAST AICc
fit<-forecast(ARIMA_fit1$AR3, h=6)
fit
#MODEL FITTING AND FORECATING USING MOST OPTIMAL ARIMA MODEL WITH LOWEST AICc
best_fit1 <- MI_TimeSeries_Train %>% model(best = ARIMA(Value~pdq(2,1,2)))
best_fit1 |> report()
forecast_AR<- best_fit1 %>% forecast(h = 12)
forecast_AR |>
autoplot(MI_TimeSeries_Train) +
labs(title="\nForecast plots for ARIMA(2,1,2) for next 12 months") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
forecast_AR
predicted=forecast_AR$.mean
observed=MI_TimeSeries_Test$Value
#MODELING EVALUATION METRICS FOR ARIMA(2,1,2)
RMSE(predicted,observed)
MAPE(predicted,observed)
MAE(predicted,observed)
#FINDING BEST ARIMA MODEL USING STEPWISE AND SEARCH GRID
ARIMA_fit2 <- MI_TimeSeries_Train |>
model(
stepwise=ARIMA(Value),
search=ARIMA(Value, stepwise = FALSE)
)
ARIMA_fit2
best_fit2 <- MI_TimeSeries_Train %>% model(best2 = ARIMA(Value~pdq(2,0,2)))
report(best_fit2)
forecast_AR2 <- best_fit2 %>% forecast(h = 12)
forecast_AR2 |>
autoplot(MI_TimeSeries_Train) +
labs(title="\nForecast plots for ARIMA(2,0,2) for next 12 months") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
predicted <- forecast_AR2$.mean
observed <- MI_TimeSeries_Test$Value
#MODEL EVALUATION METRICS FOR ARIMA(2,0,2)
predicted <- forecast_AR2$.mean
observed <- MI_TimeSeries_Test$Value
RMSE(predicted,observed)
MAPE(predicted,observed)
MAE(predicted,observed)
#SEASONAL ARIMA MODELING
#Seasonal first difference ACF for lag 12
MI_TimeSeries_Plot |>
gg_tsdisplay(difference(Value,12), plot_type="partial", lag=48) +
labs(title="Seasonally differenced", y="") +
labs(title="\nForecast plots for ARIMA(2,1,2) for next 12 months")
#Seasonal second differencing ACF for lag 12
MI_TimeSeries_Plot |>
gg_tsdisplay(difference(Value,12) |>
difference(), plot_type="partial", lag=48) +
labs(title="Seasonally second differenced", y="")
#Here we can see that the second difference of seasonal first difference looks stationary and both PACF and ACF have controlled spikes.
#Seasonal MA(2) since we two significant spikes at ACF plot at 12 and 24 months
#Non-seasonal difference, we can visually see in ACF that we have first two spikes going down and in PACF we have first 3 significant spikes showing downward trend . so we choose non-seasonal AR(2) or AR (3)
SARIMA_fit1 <- MI_TimeSeries_Train |>
model(
SAR1=ARIMA(Value),
SAR2=ARIMA(Value~pdq(0,1,2) + PDQ(0,1,2)),
SAR3=ARIMA(Value~pdq(2,1,0) + PDQ(0,1,2))
)
SARIMA_fit1 |> select(SAR1) |>
report()
SARIMA_fit1 |> select(SAR2) |>
report()
SARIMA_fit1 |> select(SAR3) |>
report()
#OPTIMAL MODEL USING STEPWISE & SEARCH GRID
SARIMA_fit2 <- MI_TimeSeries_Train |>
model(
stepwise=ARIMA(Value),
search=ARIMA(Value, stepwise = FALSE)
)
SARIMA_fit2
SARIMA_best_fit2 <- MI_TimeSeries_Train |>
model(
SAR4=ARIMA(Value~pdq(2,0,2) + PDQ(0,1,2))
)
SARIMA_best_fit2 |> select(SAR4) |>
report()
forecast_SAR1 <- SARIMA_best_fit2 |> select(SAR4) |>
forecast(h = 12)
forecast_SAR1 |>
autoplot(MI_TimeSeries_Train) +
labs(title="FORECAST PLOT FOR SARIMA (2,0,2), (0,1,2)") +
theme_gray(base_size = 18) +
theme(axis.text.x = element_text(angle=45, hjust = 1))
#MODEL EVALUATION METRICS FOR SARIMA (2,0,2) (0,1,2)
predicted <- forecast_SAR1$.mean
observed <- MI_TimeSeries_Test$Value
RMSE(predicted,observed)
MAPE(predicted,observed)
MAE(predicted,observed)