-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDA 401 Code.Rmd
1006 lines (797 loc) · 49 KB
/
DA 401 Code.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "DA 401 Project"
author: "Hao Nguyen"
date: "2024-02-23"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE
)
# Set Directory
setwd("C:/Users/nthao/OneDrive/Desktop/DA 401 Project")
# Relevant libraries
library(readr)
library(dplyr)
library(tidyverse)
library(readxl)
library(plotly)
library(gganimate)
library(DT)
library(scales)
library(tseries)
library(lmtest)
library(forecast)
library(modelsummary)
library(zoo)
library(urca)
library(dynlm)
library(car)
```
In this analysis, we will attempt to model and isolate the effect of sex ratio at birth in Vietnam's impact to population change.
# Data Importation and Cleaning
To begin with, we import the dataset from our local device.
```{r}
# Vietnam Population Data decomposed by Ages.
Population_Data_by_Age <- read_csv("C:/Users/nthao/OneDrive/Desktop/DA 401 Project/Datasets/Population Data by Age.csv")
# Vietnam General Population dataset
General_Population_Data <- read_csv("Datasets/General Population Data.csv")
# Composite Indices dataset
Composite_indices <- read_csv("Datasets/Composite indices.csv")
# Corresponding Composite Indices metadata
Composite_indices_metadata <- read_excel("Datasets/Composite indices metadata.xlsx")
# Unemployment rate dataset.
unemployment_rate <- read_excel("C:/Users/nthao/OneDrive/Desktop/DA 401 Project/Datasets/imf-dm-export-20240216 (2).xls")
# GDP PPP per capita, international dollar dataset.
GDP_PPP_capita <- read_excel("C:/Users/nthao/OneDrive/Desktop/DA 401 Project/Datasets/imf-dm-export-20240216.xls")
# Contraceptive dataset.
Contraceptive <- read_excel("C:/Users/nthao/OneDrive/Desktop/DA 401 Project/Datasets/API_SP.DYN.CONM.ZS_DS2_en_excel_v2_3785.xls",
skip = 3)
```
### Data Cleaning and Wrangling
We then proceed to execute initial data cleaning and wrangling. We note that because of the lack of contraceptive data from the World Bank of Vietnam, we use a data interpolation method. Specifically, this method fills initial and last missing values with the first and last available non-missing value respectively, and then employs linear interpolation to estimate and fill in the subsequent missing values based on the nearest known data points.
```{r}
# Clean Contraceptive dataset: remove irrelevant columns, , melt the dataset, and interpolate the data.
Contraceptive <- Contraceptive %>%
select(-`Indicator Code`) %>%
pivot_longer(cols = -c(1:3), names_to = "Year", values_to = "value") %>%
select(-`Indicator Name`) %>%
mutate(Year = as.integer(Year)) %>%
filter(`Country Code` == "VNM") %>%
mutate(value = na.approx(value, na.rm = FALSE)) %>%
# Fill initial and final NA values
mutate(value = ifelse(is.na(value),
ifelse(row_number() <= which(!is.na(value))[1],
value[which(!is.na(value))[1]],
value[which(!is.na(value))[length(which(!is.na(value)))]]
),
value
)) %>%
# Rename the 'value' column back to its original name
rename(`Contraceptive prevalence, any modern method (% of married women ages 15-49)` = value)
```
We continue with data cleaning and wrangling for other datasets. Specifically, we remove the `TOTAL` rows in `Population_Data_by_Age` dataset as these observations consists of total population measurements which could be calculated using sum of all age groups. We continue with decompose the `Population_Data_by_Age` into age groups with interval of 4 years, except for age group `0` being the population of infant in such year.
```{r}
# Clean Population_Data_by_Age dataset: remove the Total row of each year (as we could compute this ourself)
Population_Data_by_Age <- Population_Data_by_Age %>%
subset(GROUP != "TOTAL")
# Reformatted Population_Data_by_Age: change into age group.
Population_Data_by_Age_formated <- Population_Data_by_Age %>%
rename(Age = GROUP) %>% # change column name
mutate(Age = as.numeric(Age)) %>% # change data type
# re-organize into age groups
mutate(Age_Group = case_when(
Age == 0 ~ "0",
Age >= 1 & Age <= 4 ~ "1-4",
Age >= 5 & Age <= 9 ~ "5-9",
Age >= 10 & Age <= 14 ~ "10-14",
Age >= 15 & Age <= 19 ~ "15-19",
Age >= 20 & Age <= 24 ~ "20-24",
Age >= 25 & Age <= 29 ~ "25-29",
Age >= 30 & Age <= 34 ~ "30-34",
Age >= 35 & Age <= 39 ~ "35-39",
Age >= 40 & Age <= 44 ~ "40-44",
Age >= 45 & Age <= 49 ~ "45-49",
Age >= 50 & Age <= 54 ~ "50-54",
Age >= 55 & Age <= 59 ~ "55-59",
Age >= 60 & Age <= 64 ~ "60-64",
Age >= 65 & Age <= 69 ~ "65-69",
Age >= 70 & Age <= 74 ~ "70-74",
Age >= 75 & Age <= 79 ~ "75-79",
Age >= 80 & Age <= 84 ~ "80-84",
Age >= 85 & Age <= 89 ~ "85-89",
Age >= 90 & Age <= 94 ~ "90-94",
Age >= 95 & Age <= 99 ~ "95-99",
is.na(Age) ~ "100+")) %>%
# remove irrelevant columns
select(-c(GENC, `Country/Area Name`, Age)) %>%
# group by and recalculate other columns
group_by(Year, Age_Group) %>%
summarise(
Population = sum(Population, na.rm = TRUE),
`% of Population` = sum(`% of Population`, na.rm = TRUE),
`Male Population` = sum(`Male Population`, na.rm = TRUE),
`% of Males` = sum(`% of Males`, na.rm = TRUE),
`Female Population` = sum(`Female Population`, na.rm = TRUE),
`% of Females` = sum(`% of Females`, na.rm = TRUE)) %>%
mutate(`Sex ratio of the population` = `Male Population`/`Female Population`) %>%
# create a sort-key column
mutate(sort_key = case_when(
Age_Group == "0" ~ 0,
Age_Group == "1-4" ~ 1,
Age_Group == "5-9" ~ 5,
Age_Group == "10-14" ~ 10,
Age_Group == "15-19" ~ 15,
Age_Group == "20-24" ~ 20,
Age_Group == "25-29" ~ 25,
Age_Group == "30-34" ~ 30,
Age_Group == "35-39" ~ 35,
Age_Group == "40-44" ~ 40,
Age_Group == "45-49" ~ 45,
Age_Group == "50-54" ~ 50,
Age_Group == "55-59" ~ 55,
Age_Group == "60-64" ~ 60,
Age_Group == "65-69" ~ 65,
Age_Group == "70-74" ~ 70,
Age_Group == "75-79" ~ 75,
Age_Group == "80-84" ~ 80,
Age_Group == "85-89" ~ 85,
Age_Group == "90-94" ~ 90,
Age_Group == "95-99" ~ 95,
Age_Group == "100+" ~ 100)) %>%
# sort based on group
arrange(sort_key) %>%
# sort based on year
group_by(sort_key) %>%
arrange(Year) %>%
ungroup()
```
For Composite Indices metadata and Composite Indices datasets, we first remove non-essential columns within the metadata and rows with NA values. We continue with clean the Composite Indices dataset and melt the dataset into data tidy format. Lastly, we merge the metadata with the dataset in order to have a column showing the definitions of indices.
```{r}
# Clean Composite Indices metadata: remove rows with NA values and remove the year column.
Composite_indices_metadata <- Composite_indices_metadata %>%
na.omit() %>%
subset(select = -c(`Time series`)) %>%
# rearrange columns
select(`Short name`, `Full name`)
# Clean Composite Indices dataset: melt the column names into different variables showing different indecies and years, then replace the short names of the indecies into their definitions from the meta data.
Composite_indices <- Composite_indices %>%
### change the last "_" into "__" of column names that have such feature.
rename_with(~ gsub("_(?!.*_)", "__", ., perl = TRUE)) %>%
### melt columns into 2 variables: index and years using "__" as separator within the initial column name.
pivot_longer(
cols = -c(iso3, country, hdicode, region),
names_to = c("index", "year"),
names_sep = "__"
) %>%
### join the dataset with the metadata to define the index.
left_join(Composite_indices_metadata, by = c("index" = "Short name")) %>%
### Select only relevant country
filter(iso3 == "VNM") %>%
### rearrange columns for visualization.
select(iso3, country, region, hdicode, index, `Full name`, year, value) %>%
### change year value into int
mutate(year = as.integer(year))
```
Further, we note from the UN documentation: https://hdr.undp.org/data-center/documentation-and-downloads and would use only 3 main indices: `Human Development Index (hdi)`, `Gender Development Index (gdi)`, and `Gender Inequality Index (gii)`. We chose to focus on these indices due to their comprehensive representation of human development, gender disparities, and gender inequality, respectively. These indices are also widely recognized and utilized in research and policy contexts, offering a standardized framework for assessing and comparing societal progress across countries and within countries at different periods. Thus:
```{r}
# Filter picked indicies only
Composite_indices <- Composite_indices %>%
filter(index == "hdi" | index == "gdi" | index == "gii")
```
For the remaining datasets, we conduct basic data cleaning process, including: remove NAs, mutate into integers, rename columns etc.
```{r}
# Clean Vietnam General Population dataset: remove the 1st & 2nd columns ("Name" & "Region") and remove rows with all NA values.
General_Population_Data <- General_Population_Data %>%
subset(select = -c(Name, Region)) %>%
filter(!is.na(GENC)) %>%
# change all columns except the first column into int
mutate(across(-1, ~str_replace_all(., ",", ""))) %>% # Remove commas
mutate(across(-1, ~str_trim(., side = "both"))) %>% # Trim spaces
mutate(across(-1, as.numeric)) # Convert to numeric
# Clean Vietnam General Population dataset: Remove years with NA values: only take data after 1989.
General_Population_Data <- General_Population_Data[complete.cases(General_Population_Data), ]
# Clean Unemployment rate dataset: convert all "no data" values into NA, remove all rows that have full NA values, remove columns that have full NA values.
unemployment_rate[unemployment_rate == "no data"] <- NA # Change all "no data" values into NA.
unemployment_rate <- unemployment_rate %>%
select(-1) %>%
filter(rowSums(is.na(.)) != ncol(.)) %>% # Remove rows with all NAs
t() %>%
na.omit() # Remove after transposed rows with all NAs.
### Rename column
colnames(unemployment_rate)[1] <- "Unemployment Rate"
# Clean GDP PPP per capita, international dollar dataset: convert all "no data" values into NA, remove all rows that have full NA values, remove columns that have full NA values, and transpose the dataset.
GDP_PPP_capita[GDP_PPP_capita == "no data"] <- NA # Change all "no data" values into NA.
GDP_PPP_capita <- GDP_PPP_capita %>%
select(-1) %>%
filter(rowSums(is.na(.)) != ncol(.)) %>% # Remove rows with all NAs
t() %>%
na.omit() # Remove after transposed rows with all NAs.
### Rename column
colnames(GDP_PPP_capita)[1] <- "GDP PPP per capita, international dollar"
```
We now combine all our datasets into 1 comprehensive dataset. Our main dataset would be of format that include columns: country code, year, and all independent and dependent variables. As such, we further modify current datasets.
```{r}
# Pivot wider composite indices dataset to combine into main_df
wide_composite_indices <- Composite_indices %>%
# Remove full name and use short name
subset(select = -c(`Full name`)) %>%
# pivot the dataset
pivot_wider(names_from = index, values_from = value) %>%
# remove all NA values variables
select(where(~ all(complete.cases(.)))) %>%
# arrange rows chronically
arrange(year) %>%
# remove country name & identifier
subset(select = -c(iso3, country, region, hdicode))
# Modified gdp_ppp_capita data to combine into main_df
GDP_PPP_capita <- as.data.frame(GDP_PPP_capita)
GDP_PPP_capita$year <- as.numeric(rownames(GDP_PPP_capita))
rownames(GDP_PPP_capita) <- NULL
# Modified unemployment_rate data to combine into main_df
unemployment_rate <- as.data.frame(unemployment_rate)
unemployment_rate$year <- as.numeric(rownames(unemployment_rate))
rownames(unemployment_rate) <- NULL
# main_df: comprehensive dataset
main_df <- General_Population_Data %>%
inner_join(wide_composite_indices, by = c("Year" = "year")) %>%
inner_join(GDP_PPP_capita, c("Year" = "year")) %>%
inner_join(unemployment_rate, c("Year" = "year")) %>%
inner_join(subset(Contraceptive, select = -c(1,2)), c("Year" = "Year")) %>%
# convert all columns except the country code column into numeric values
mutate_at(vars(-1), as.numeric)
# Since we no longer use `Composite_indices`, `GDP_PPP_capita`, and `unemployment_rate`, thus:
rm(Composite_indices)
rm(GDP_PPP_capita)
rm(unemployment_rate)
rm(Contraceptive)
```
# Summary Statistics
### Comprehensive dataset: `main_df`
Regarding the `main_df` dataset, we create a correlation matrix between all dependent and independent variables to identify potential multicollinearity or relationships.
```{r}
# Set a high threshold for heavily correlated matrix
threshold <- 0.8
# Matrix for heavily correlated variables
corr_matrix_lar_filtered <- cor(main_df[,-c(1:3)]) %>%
# Replace values below threshold with NA
{
replace_mat <- .
replace_mat[-threshold < replace_mat & replace_mat < threshold] <- NA
replace_mat
} %>%
# Remove columns that contain only 1 and NA
{
cols_to_keep <- !apply(., 2, function(x) all(x[!is.na(x)] == 1))
.[, cols_to_keep]
} %>%
# Remove rows that contain only 1 and NA
{
rows_to_keep <- !apply(., 1, function(x) all(x[!is.na(x)] == 1))
.[rows_to_keep, ]
} %>%
# Create correlation matrix
{
significant_vars <- apply(!is.na(.), 1, any)
corr_matrix_temp <- .[significant_vars, significant_vars]
diag(corr_matrix_temp) <- NA # Exclude self-correlations
corr_matrix_temp
}
# Create an interactive heatmap with the filtered matrix
corr_matrix <- plot_ly(x = colnames(corr_matrix_lar_filtered), y = colnames(corr_matrix_lar_filtered), z = corr_matrix_lar_filtered,
type = "heatmap", colorscale = "Plasma",
zmin = -1, zmax = 1) %>%
layout(title = "Filtered Correlation Matrix (Absolute Corr >= 0.8)",
xaxis = list(title = "Variables", tickangle = 45, tickfont = list(size = 10)),
yaxis = list(title = "Variables", tickangle = -45, tickfont = list(size = 10)),
margin = list(l = 100, r = 100, t = 100, b = 100))
# Visualization
corr_matrix
# Remove unnecessary variable names from global environment
rm(threshold)
rm(corr_matrix)
```
We see that marking the threshold for large correlated variables at absolute value of 0.8 shows large number of indices in the Composite_indices dataset being correlated with each other. We hypothesize that the large correlation stems from various variables within Composite_indices dataset of the UN belonging in the same group, thus, measuring similar social quantities. In addition, we recognize that some variables are calculated using similar metrics as other variables, which would further explain the large correlation.
For now, we plan to identify the variables that are created using metrics that are similar to other variables. However, further identification of these variables would be conducted in the future.
### General Population Data
Population change is our main dependent variable, as such, we attempt to graph population levels and rate of change (%) over time. We notice that the dataset General_Population_Data includes past, current, and future projection of population data until 2100. We thus marks a dash line at 2024 to separate between real and projected data.
We proceed to graph out the historical and projection of the country population from 1989 till 2100 from our dataset.
```{r}
# graph out the historical and projection population of the country.
pop_trend <- ggplot(General_Population_Data, aes(x = Year)) +
geom_line(aes(y = Population, colour = "Population"), size = 1) +
geom_line(aes(y = `Annual Growth Rate %` * 100000000, colour = "Rate of Change"), size = 1) +
geom_vline(xintercept = 2024, linetype = "dashed", color = "black") + # Add dashed line at year 2024: years to the right being projection
scale_y_continuous(
name = "Total Population",
labels = function(x) format(x, scientific = FALSE),
sec.axis = sec_axis(~ (. / 100000000), name="Rate of Change (%)")
) +
labs(colour = "Legend") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 12), # Change axis text size for x axis
axis.text.y = element_text(size = 12), # Change axis text size for y axis
axis.title.x = element_text(size = 12), # Change axis title size for x axis
axis.title.y = element_text(size = 12) # Change axis title size for y axis
)
ggsave("pop_trend.jpeg", plot = pop_trend, width = 10, height = 6, dpi = 300)
pop_trend
```
We see that since 1989 till now, the population of Vietnam has been on an increasing trend. However, we also see that the population levels would be projected to peak around the 2050s. In term of the change in population levels, the population growth rate has been in declined rapidly since the beginning of available dataset (1989) from more than 2% to a predicted rate of less than 0% annually in 60 years. However, we note that the chart only provides an overview of the population trends and historical as well as projected population data may very well varies with different characteristics.
Thus, we further decompose the population by age group by creating a population pyramid over time. We convert the `Population_Data_by_Age` into an age structure pyramid. We first reorder the dataset into age groups of interval 5 years, except for age 0 (being born) and interval 1-4 years old then create the pyramid.
```{r}
# Reshape data
df_pyramid <- Population_Data_by_Age_formated %>%
{
pop_age <-.
pop_age <- select(pop_age, Year, sort_key, Age_Group, `Male Population`, `Female Population`)
pop_age
} %>%
pivot_longer(
cols = c("Male Population", "Female Population"),
names_to = "Gender",
values_to = "Population"
) %>%
# Negate female population for pyramid
mutate(Gender = ifelse(Gender == "Male Population", "Male", "Female"),
Population = ifelse(Gender == "Female", -Population, Population))
# Reshape data to have separate columns for Male and Female
df_pyramid <- df_pyramid %>%
spread(key = Gender, value = Population) %>%
arrange(Year, sort_key)
# Determine the range for setting symmetrical axis limits
max_population <- ceiling(max(abs(c(df_pyramid$Male, df_pyramid$Female)), na.rm = TRUE) / 1000000) * 1000000
tick_values <- seq(-max_population, max_population, by = 1000000) # Each step is 1 million
# Create the initial plot
pop_pyramid <- plot_ly(df_pyramid, type = 'bar', orientation = 'h',
transforms = list(
list(
type = 'filter',
target = ~Year,
operation = '=',
value = min(df_pyramid$Year) # Start with the earliest year
)
))
# Define colors for male and female
male_color = 'rgba(255, 0, 0, 0.6)' # Light red with less transparency
female_color = 'rgba(0, 0, 255, 0.6)' # Light blue with less transparency
# Add traces for Male and Female with customized hoverinfo and without showing text on bars
pop_pyramid <- pop_pyramid %>% add_trace(x = ~Male, y = ~Age_Group, name = 'Male', text = ~Male, hoverinfo = 'text+y', textposition = 'none', marker = list(color = male_color))
pop_pyramid <- pop_pyramid %>% add_trace(x = ~Female, y = ~Age_Group, name = 'Female', text = ~abs(Female), hoverinfo = 'text+y', textposition = 'none', marker = list(color = female_color))
# Layout adjustments
pop_pyramid <- pop_pyramid %>% layout(
yaxis = list(title = 'Age Group'),
xaxis = list(
title = 'Population',
tickvals = tick_values,
ticktext = lapply(tick_values, function(x) ifelse(x == 0, "0", scales::comma(abs(x))))
),
barmode = 'overlay',
title = 'Population Pyramid'
)
# Slider for year selection
years <- sort(unique(df_pyramid$Year))
slider_steps <- lapply(years, function(year) {
list(
method = "restyle",
args = list("transforms[0].value", as.character(year)),
label = as.character(year)
)
})
pop_pyramid <- pop_pyramid %>% layout(
sliders = list(
list(
active = 0,
currentvalue = list(prefix = "Year: "),
steps = slider_steps
)
)
)
# Print the plot
pop_pyramid
# Remove unnessasry variables
rm(max_population)
rm(tick_values)
rm(male_color)
rm(female_color)
rm(pop_pyramid)
rm(slider_steps)
```
From the population pyramid over time, we can clearly see a generation less than 30 years old in 1989 created a large bump in the population pyramid. This generation is likely to drive population growth in the year ahead as they were heading into prime age to establish a family. In addition, we also saw that the number of children being born (age 0) decline over time. Thus, signaling declining fertility being the main drivers of population reduction in the long term. Still, we note the fact that in each age group the large increase in population levels does not have a corresponding dramatic reduction. This is likely due to advancement in adopting modern healthcare system, thus, keeping more people living at the same time and keeps the age group wide for longer.
Most importantly, we notice further from the pyramid that there are differences between the number of males and females at every age group with initially male outnumbers female individuals, then the trends reversed with increasing age groups. As such, we visualize the difference between number of male and female at birth and at each age group.
```{r}
# Creating an ordered factor for Age_Group
age_group_levels <- c("0", "1-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90-94", "95-99", "100+")
df_pyramid$Age_Group <- factor(df_pyramid$Age_Group, levels = age_group_levels, ordered = TRUE)
# Calculate the difference between male and female populations
df_difference <- df_pyramid %>%
mutate(Difference = Male + Female) %>%
arrange(Year, Age_Group)
# Define colors
male_color = 'rgba(255, 0, 0, 0.6)'
female_color = 'rgba(0, 0, 255, 0.6)'
# Determine the range for the axis limits (based on the maximum absolute difference)
max_diff <- ceiling(max(abs(df_difference$Difference), na.rm = TRUE) / 1e5) * 1e5
tick_values_diff <- seq(-max_diff, max_diff, by = 1e5) # Each step is 100,000
# Create the interactive plot with conditional colors
pop_pyramid_diff <- plot_ly(data = df_difference, x = ~Difference, y = ~Age_Group, type = 'bar', orientation = 'h',
name = 'Difference', text = ~Difference, hoverinfo = 'text+y', textposition = 'none',
marker = list(color = ifelse(df_difference$Difference > 0, male_color, female_color)),
transforms = list(
list(
type = 'filter',
target = ~Year,
operation = '=',
value = min(df_difference$Year)
)
)) %>%
layout(
yaxis = list(title = 'Age Group'),
xaxis = list(
title = 'Population Difference (red: male exceed; blue: female exceed)',
zeroline = TRUE,
tickvals = tick_values_diff,
ticktext = lapply(tick_values_diff, function(x) scales::comma(abs(x))),
showgrid = TRUE # Display grid lines
),
barmode = 'overlay',
title = 'Male or Female Population Differences by Age Group',
sliders = list(
list(
active = 0,
currentvalue = list(prefix = "Year: "),
steps = lapply(sort(unique(df_difference$Year)), function(year) {
list(method = "restyle", args = list("transforms[0].value", as.character(year)), label = as.character(year))
})
)
)
)
# Display the plot
pop_pyramid_diff
# Remove unnecessary datasets or variables
rm(male_color)
rm(female_color)
rm(max_diff)
rm(tick_values_diff)
rm(df_difference)
rm(df_pyramid)
rm(age_group_levels)
rm(pop_pyramid_diff)
```
We clearly see an initial male bias at birth of the population pyramid in 1989 then drastically turned into female bias by the 25-29 age group. This is reasonable considering the end period of the Vietnam war in 1970s. Further, over time, we see the male bias at birth starting to dominate other age groups at older ages with currently in 2024 the male bias is dominating up until 45 years of age. Thus, we dive deeper into the sex ratio at birth and at different age group of the population over time.
```{r}
# Filter the dataframe for specific age groups
df <- Population_Data_by_Age_formated %>%
select(Year, Age_Group, `Sex ratio of the population`) %>%
filter(Age_Group == "0" | grepl("^10|^20|^30|^40|^50|^60|^70|^80|^90|^100", Age_Group))
# Filter the dataframe for specific age groups
df <- Population_Data_by_Age_formated %>%
select(Year, Age_Group, `Sex ratio of the population`) %>%
filter(Age_Group == "0" | grepl("^10|^20|^30|^40|^50|^60|^70|^80|^90|^100", Age_Group))
# Create the plot
Sex_Ratio_OT <- df %>%
plot_ly(x = ~Year, y = ~`Sex ratio of the population`, color = ~Age_Group, type = 'scatter', mode = 'lines') %>%
layout(title = "Sex Ratio Over Time by Age Group",
xaxis = list(title = "Year"),
yaxis = list(title = "Sex Ratio"),
shapes = list(
list(
type = "line",
line = list(dash = "dash", color = "black"),
x0 = 2024, x1 = 2024, y0 = 0, y1 = 1,
xref = 'x', yref = 'paper'
)
))
# Display the plot
Sex_Ratio_OT
# Remove unnecessary variables
rm(df)
rm(Sex_Ratio_OT)
```
We see that before and by 2024, the majority of age group above the age of 30 are female dominated. This patterns is more and more drastic the older the population. However, we also notice a male bias at birth forming in the 2010s and its effects continuing until 2080 when this generation starting to returning to equilibrium due to old age. On a overview point of view, we see that in all age groups there is either a male bias sex ratio or having a trend toward male bias in the population. While this patterns may have historical causes due to war, the relative peace since 1975 and the forming of male bias at birth of newer generations show a son preference culture in the country. This male bias at birth reaches a peak at 1.13 or 113 boy born relative to 100 girl born. The trends is predicted to decline but would prolong until 2050s when the population levels declining (as shown in above graph). As such, we have reason to believe in a connection between a male bias population at birth and the declining population growth rate.
### Note
We note before our analysis. We consider only historical data and not projected data from our dataset. Thus, we modify the datasets as follow:
```{r}
# Identify all main datasets real observations only using main_df
Population_Data_by_Age_formated <- Population_Data_by_Age_formated[as.numeric(Population_Data_by_Age_formated$Year) %in% as.numeric(main_df$Year),]
General_Population_Data <- General_Population_Data[as.numeric(General_Population_Data$Year) %in% as.numeric(main_df$Year),]
Population_Data_by_Age <- Population_Data_by_Age[as.numeric(Population_Data_by_Age$Year) %in% as.numeric(main_df$Year),]
```
# Analysis
### Simple OLS with Time Series Adjustments model between main dependent and independent variable
We first start off with implementing a simple linear regression between our main dependent and independent variable: being population change and sex ratio at birth over time respectively. We show the first 6 observations within the dataset. We note that the value of `Sex ratio of the population` being number of males per 1000 females.
```{r}
# sex ratio of a specified age group
sex_ratio_df <- Population_Data_by_Age %>%
filter(GROUP >= 15, GROUP <= 49) %>%
group_by(Year) %>%
summarize(
TotalMalePopulation = sum(`Male Population`),
TotalFemalePopulation = sum(`Female Population`),
SexRatio = TotalMalePopulation / TotalFemalePopulation * 1000
) %>%
select(Year, SexRatio)
# simple model dataset
sim_data <- General_Population_Data %>%
select(Year, `Annual Growth Rate %`) %>%
merge(sex_ratio_df, by = "Year") %>%
select(-Year) %>%
mutate(`Sex ratio of the population` = 1000*SexRatio) %>%
ts()
# first few lines of the dataset.
head(sim_data)
```
Before running the model, we check for variables' stationary.
```{r}
# Creating syntax to store relevant variables
`Annual Growth Rate %` <- sim_data[, "Annual Growth Rate %"]
`Sex ratio of the population` <- sim_data[, "Sex ratio of the population"]
# Applying the ADF test to 'Annual Growth Rate %'
summary(ur.df(diff(`Annual Growth Rate %`), type="drift", lags = 10, selectlags = c("AIC")))
# Applying the ADF test to 'Sex ratio of the population'
summary(ur.df(diff(diff(`Sex ratio of the population`)), type="drift", lags = 5, selectlags = c("AIC")))
```
We notice that the results of Augmented Dickey-Fuller Test varies between the 2 variables but do not support clear stationary for either variables. Specifically, we note `Sex ratio of the population` variable's Dickey-Fuller statistics and p-value of `r round(adf.test(sim_data[, "Sex ratio of the population"])$statistic,3)` and `r round(adf.test(sim_data[, "Sex ratio of the population"])$p.value ,3)` respectively, indicating clear non-stationary due to a not sufficiently negative Dickey-Fuller statistics and a high p-value. In term of `Annual Growth Rate %` variable's results of `r round(adf.test(sim_data[, "Annual Growth Rate %"])$statistic,3)` and `r round(adf.test(sim_data[, "Annual Growth Rate %"])$p.value ,3)` for Dickey-Fuller statistics and p-value respectively, we see moderate support for stationary. This is because the Dickey-Fuller statistics for `Annual Growth Rate %` is sufficiently negative at 5% and the p-value is still significant at 10% level.
Nonetheless, we continue with implement first differencing for both variables and retest for stationary. We believe that while the result for `Annual Growth Rate %` stationary being inconclusive, differencing would likely support both variables to achieve stationary. We show the first 6 observations of differencing variables.
```{r}
# First Differencing
diff_annual_growth_rate <- diff(`Annual Growth Rate %`)
diff_sex_ratio <- diff(`Sex ratio of the population`)
# first few lines of both variables
head(ts(data.frame(diff_annual_growth_rate, diff_sex_ratio)))
```
In addition, we are aware of the existence of male-bias sex ratio at birth in Vietnam, stems from culture, economical, and historical background. As such, we believe there exists an underlying trend driving the increase in male-bias sex ratio at birth for the population. Thus, we retest for stationary:
```{r}
# retest for stationary
# For Annual Growth Rate %
adf.test(diff_annual_growth_rate, alternative = "stationary")
# For Sex Ratio of the Population
summary(ur.df(diff_sex_ratio, type="trend"))
```
The ADF test results for the `diff_annual_growth_rate` variable yielded a Dickey-Fuller statistic of `r round(adf.test(diff_annual_growth_rate, alternative = "stationary")$statistic,3)` with a p-value of `r round(adf.test(diff_annual_growth_rate, alternative = "stationary")$p.value,3)`.The high p-value, well above the conventional threshold levels at 1%, 5% and 10% significance, indicates a failure to reject the null hypothesis of a unit root, suggesting that the differenced annual growth rate series remains non-stationary.
Similarly, `diff_sex_ratio` variable yielded a Dickey-Fuller statistic of `r round(ur.df(diff_sex_ratio, type="trend")@teststat[1],3)` with an associated p-value of `r round(0.03881,3)`, suggesting not enough evidence for rejecting of the null hypothesis of a unit root with a linear time trend.
We thus further differencing both variables and re-test the variable for stationary.
```{r}
# Second Differencing
sed_diff_annual_growth_rate <- diff(diff_annual_growth_rate)
sed_diff_sex_ratio <- diff(diff_sex_ratio)
# retest for stationary
# For population growth rate annually
adf.test(sed_diff_annual_growth_rate, alternative = "stationary")
# for annual sex ratio
summary(ur.df(sed_diff_sex_ratio, type="trend"))
```
We see that the variable `sed_diff_annual_growth_rate` being the 2nd differencing of `Annual Growth Rate %` variable possess a Dickey-Fuller statistics of `r round(adf.test(sed_diff_annual_growth_rate, alternative = "stationary")$statistic,3)` and a p-value of `r round(adf.test(sed_diff_annual_growth_rate, alternative = "stationary")$p.value,3)` after performing ADF test. This results indicate the time series variable is on the borderline of stationary according to the 5% threshold but can be considered stationary at a slightly less stringent level. Similarly, the `sed_diff_sex_ratio` being the 2nd differencing of `Sex ratio of the population` possess a Dickey-Fuller statistics of `r round(ur.df(sed_diff_sex_ratio, type="trend")@teststat[1],3)` and a p-value of `r round(0.0009192,4)` after performing ADF test.
Hence, we then implement OLS with these modified variables: `sed_diff_annual_growth_rate` and `sed_diff_sex_ratio`. However, before analyzing the model's results, we check for cointegration between the dependent and independent variables of the model.
```{r}
# simple linear regression model
sim_model <- lm(sed_diff_annual_growth_rate ~ sed_diff_sex_ratio)
# Check for cointegration by testing for stationary of residuals
summary(ur.df(residuals(sim_model), type="trend"))
```
The ADF test for including a trend component was applied to the residuals of the model. The test results yielded a notably significant Dickey-Fuller statistic of `r round(ur.df(residuals(sim_model), type="trend")@teststat[1],3)`. When compared to the critical values for the ADF test (tau3), this statistic is well beyond the critical value at the 1% level (-4.15). Such a substantial deviation into negative territory indicates a clear rejection of the null hypothesis of a unit root, implying that the residuals are stationary, and thus, the dependent and independent variables possess positive a strong indication of cointegration. Thus, the result implies a long-term equilibrium relationship between the 2 variables.
We thus continue with OLS regression's results.
```{r}
summary(sim_model)
mcvbn$coefficients[2,1]
```
We see that the regression model yields results indicating a statistically insignificant relationship between the second differenced annual population growth rate `sed_diff_annual_growth_rate` and the second differenced sex ratio at birth or number of males per 1000 females born `sed_diff_sex_ratio`. We observe that the intercept, independent, and model's p-values are all significantly larger than 0.5, suggesting a statistically insignificant relationship. Additionally, the model's multiple R-squared value is 0.00145, indicating that only a small fraction of the variance in the dependent variable is explained by the model, suggesting a weak explanatory power.
We expect this results because the model is a simplistic model that only contain 1 modified-independent and 1 modified-dependent variable. Further, we recognize that the relationship between sex ratio and population change would be convoluted, i.e affecting through fertility and other variables, as well as take a lag time components so that the difference in number of male and female would start to affect fertility rate. Nonetheless, for now, we conclude no-relationship between our modified-independent and modified-dependent variables.
### Autoregressive Distributed Lag (ARDL) Model
From the results of a simple OLS time series adjustments model, we note that both population growth rate % and sex ratio at birth are non-stationary with population growth rate % achieving stationary at 2nd order differenced and sex ratio at birth achieving stationary at 1st order differenced. As such, we choose the further our analysis by implementing ARDL model. This is because:
- The ARDL model can accommodate variables that are integrated at different orders (I(0), I(1)). This flexibility is a key advantage, as it allows for the direct inclusion of variables with different levels of differencing, without the need to pre-difference all series to the same order of integration.
- ARDL models uniquely allow for the estimation of both long-run and short-run dynamics simultaneously.
- ARDL model is known for its robustness in small sample sizes, critically relevant for our dataset.
Nonetheless, we do note that since ARDL model assume stationary at I(1) and population growth rate % only achieving stationary at 2nd order differenced. Thus, our dependent variable would the 1st order differenced population growth rate % rather than absolute level population growth rate %.
In addition, having detected the connection between dependent and independent variable using simple linear regression, we see that the current level of sex ratio at birth does not have any explanatory power over population change. Thus, we focus our continue analysis in dynamic model (model with independent variables being lag version)
1. ARDL model with 1 dependent and 1 independent variable.
We first start with identify lag length for both dependent and independent variables by using information criteria: AIC, BIC.
```{r}
# model data
data <- data.frame(diff_annual_growth_rate, diff_sex_ratio)
# We test until the lag level: 5
max_lag_diff_annual_growth_rate <- 5
max_lag_diff_sex_ratio <- 5
# Create matrix to identify optimal lags.
aic_values <- matrix(NA, nrow = max_lag_diff_annual_growth_rate, ncol = max_lag_diff_sex_ratio)
bic_values <- matrix(NA, nrow = max_lag_diff_annual_growth_rate, ncol = max_lag_diff_sex_ratio)
# Fill in matrix
for (lag_growth in 1:max_lag_diff_annual_growth_rate) {
for (lag_sex_ratio in 1:max_lag_diff_sex_ratio) {
formula <- as.formula(paste("diff_annual_growth_rate ~ L(diff_annual_growth_rate, 1:",
lag_growth, ") + L(diff_sex_ratio, 1:",
lag_sex_ratio, ")", sep=""))
model <- dynlm(formula, data = data)
aic_values[lag_growth, lag_sex_ratio] <- AIC(model)
bic_values[lag_growth, lag_sex_ratio] <- BIC(model)
}
}
# Identify optimal lag
optimal_lag_aic <- which(aic_values == min(aic_values), arr.ind = TRUE)
optimal_lag_bic <- which(bic_values == min(bic_values), arr.ind = TRUE)
```
Our AIC results indicate an optimal lag length of `r optimal_lag_aic[1]` and `r optimal_lag_aic[2]`, whereas BIC's results indicate an optimal lag length of `r optimal_lag_bic[1]` and `r optimal_lag_bic[2]` for dependent and independent variable respectively.
Thus, in term of AIC, we infer a model and model's results of:
```{r}
# Using the optimal lag determined by AIC
optimal_lag_growth_aic <- optimal_lag_aic[1, 1]
optimal_lag_sex_ratio_aic <- optimal_lag_aic[1, 2]
# apply model
AIC_optimal <- as.formula(paste("diff_annual_growth_rate ~ L(diff_annual_growth_rate, 1:",
optimal_lag_growth_aic, ") + L(diff_sex_ratio, 1:",
optimal_lag_sex_ratio_aic, ")", sep=""))
AIC_ARDL <- dynlm(AIC_optimal, data = data)
# model's result
summary(AIC_ARDL)
# Calculate the p-value
AIC_p_value <- pf(summary(AIC_ARDL)$fstatistic[1], summary(AIC_ARDL)$fstatistic[2], summary(AIC_ARDL)$fstatistic[3], lower.tail = FALSE)
```
And, in term of BIC, we infer a model and model's results of:
```{r}
# Using the optimal lag determined by BIC
optimal_lag_growth_bic <- optimal_lag_bic[1, 1]
optimal_lag_sex_ratio_bic <- optimal_lag_bic[1, 2]
BIC_optimal <- as.formula(paste("diff_annual_growth_rate ~ L(diff_annual_growth_rate, 1:",
optimal_lag_growth_bic, ") + L(diff_sex_ratio, 1:",
optimal_lag_sex_ratio_bic, ")", sep=""))
BIC_ARDL <- dynlm(BIC_optimal, data = data)
# Results
summary(BIC_ARDL)
# Calculate the p-value
BIC_p_value <- pf(summary(BIC_ARDL)$fstatistic[1], summary(BIC_ARDL)$fstatistic[2], summary(BIC_ARDL)$fstatistic[3], lower.tail = FALSE)
```
Since we see that AIC's model perform similarly well with that of BIC with both suggesting same lags length for both variables. We see that both models thus return adjusted R-squared of `r round(summary(AIC_ARDL)$adj.r.squared,2)` as well as model's p-value of `r round(AIC_p_value, 3)`
As such, we would move forward with lag results of `r optimal_lag_aic[1]` and `r optimal_lag_aic[2]` for dependent and independent variable respectively. We check accuracy for the lag models.
```{r}
checkresiduals(AIC_ARDL)
```
We visually confirm that the residuals to appear random and centered around zero. In additon, we see that there is no ACF value that less than -0.3 or more than 0.3, indicating no-present of autocorrelation. Nonetheless, we also see that the residuals distribution only roughly follow a random distribution with a mean of approximately at 0, thus, somewhat weaken our model.
From the AIC model results, we could say that:
Lagged Effect of Change of Population Growth Rate: The significant positive coefficient for the second lag of the population growth rate indicates a persistence effect in population growth. This could imply that factors affecting population growth have a lasting impact over a two-year period.
Insignificance of Sex Ratio: The lack of significance in the 1st order differenced sex ratio's lagged terms suggests that, in this model and with these data, the sex ratio does not have a statistically identifiable impact on the annual growth rate of the population. This could be due to a variety of reasons, such as other more influential factors not included in the model or the possibility that the effect of sex ratio on population growth does not manifest in the immediate short term of around 1 to 2 years.
2. ARDL model with 1 dependent and multiple independent variable.
We understand that in demographic studies, population change is driven directly by 4 factors: fertility rate, mortality rate, immigration rate, and emigration rate. In addition, we take into account other factors by including: HDI, GDI, and GII.
We check for covariates' stationary. Similar to dependent and independent variables, we use ADF test:
```{r}
# check stationary of fertility rate with trend component (because we assume that 2 child policy works)
summary(ur.df(diff(main_df$`Total Fertility Rate`), type="trend", lags = 10, selectlags = "AIC"))
# check stationary of rate of change of the slope of life expectancy at birth, both sexes (because we assume there is a decreasing marginal gain with each additonal longer year of life)
summary(ur.df(main_df$`Life Expectancy at Birth, Both Sexes`, type="drift", lags = 10, selectlags = "AIC")) # in ARDL I(0) at 1st differencing
# no need assumption
summary(ur.df(diff(diff(main_df$`Net Migration Rate`)), type="drift", lags = 10, selectlags = "AIC"))
# hdi marginal gains decrease over time.
summary(ur.df(diff(main_df$hdi), type = "trend"))
# gdi marginal gain decrease over time
summary(ur.df(diff(main_df$gdi), type = "trend"))
# gii reached stationary at 2rd differencing
summary(ur.df(diff(diff(main_df$gii)), type="trend", lags = 10, selectlags = "AIC"))
diff_annual_growth_rate
diff_sex_ratio
main_df$`Total Fertility Rate`
# model data
data <- data.frame(diff_annual_growth_rate = diff_annual_growth_rate,
diff_sex_ratio = diff_sex_ratio,
TFR = main_df$`Total Fertility Rate`[-1],
`Net Migration Rate`= main_df$`Net Migration Rate`[-1],
hdi = main_df$hdi[-1],
gdi = diff(main_df$gdi),
diff_gii = diff(main_df$gii), check.names = FALSE)
```
b. Define Optimal Lag
Having had the data, we determine the optimal amount of lags based on AIC or BIC criteria.
```{r}
# Define maximum lag for each variable
max_lag <- 4
data <- na.omit(data)
data_test <- na.omit(data)
# Pre-generate lagged variables for independent variables
for (lag in 1:max_lag) {
# Generate lagged variables for the dependent variable
data_test[[paste("lagged_growth_rate", lag, sep = "_")]] <- lag(data_test$diff_annual_growth_rate, lag)
# Generate lagged variables for the independent variable
data_test[[paste("lagged_sex_ratio", lag, sep = "_")]] <- lag(data_test$diff_sex_ratio, lag)
data_test[[paste("lagged_TFR", lag, sep = "_")]] <- lag(data_test$TFR, lag)
data_test[[paste("lagged_migration", lag, sep = "_")]] <- lag(data_test$`Net Migration Rate`, lag)
data_test[[paste("lagged_hdi", lag, sep = "_")]] <- lag(data_test$hdi, lag)
data_test[[paste("lagged_gdi", lag, sep = "_")]] <- lag(data_test$gdi, lag)
data_test[[paste("lagged_gii", lag, sep = "_")]] <- lag(data_test$diff_gii, lag)
}
# Create matrices for AIC and BIC values
aic_values <- array(NA, dim = c(rep(max_lag + 1, 7)))
bic_values <- array(NA, dim = c(rep(max_lag + 1, 7)))
# Nested loop for model fitting
for (lag0 in 0:max_lag) { # Lag for the dependent variable
for (lag1 in 0:max_lag) {
for (lag2 in 0:max_lag) {
for (lag3 in 0:max_lag) {
for (lag4 in 0:max_lag) {
for (lag5 in 0:max_lag) {
for (lag6 in 0:max_lag) {
# Skip if all lags are zero
if (lag0 == 0 && lag1 == 0 && lag2 == 0 && lag3 == 0 && lag4 == 0 && lag5 == 0 && lag6 == 0) {
next
}
# Construct the model formula
formula_parts <- c("diff_annual_growth_rate ~")
if (lag0 > 0) formula_parts <- c(formula_parts, paste("lagged_growth_rate", lag0, sep = "_"))
if (lag1 > 0) formula_parts <- c(formula_parts, paste("lagged_sex_ratio", lag1, sep = "_"))
if (lag2 > 0) formula_parts <- c(formula_parts, paste("lagged_TFR", lag2, sep = "_"))
if (lag3 > 0) formula_parts <- c(formula_parts, paste("lagged_migration", lag3, sep = "_"))
if (lag4 > 0) formula_parts <- c(formula_parts, paste("lagged_hdi", lag4, sep = "_"))
if (lag5 > 0) formula_parts <- c(formula_parts, paste("lagged_gdi", lag5, sep = "_"))
if (lag6 > 0) formula_parts <- c(formula_parts, paste("lagged_gii", lag6, sep = "_"))
model_formula <- as.formula(paste(formula_parts, collapse = " + "))
# Fit the model
model <- dynlm(model_formula, data = data_test)
# Store AIC and BIC values
aic_values[lag0 + 1, lag1 + 1, lag2 + 1, lag3 + 1, lag4 + 1, lag5 + 1, lag6 + 1] <- AIC(model)
bic_values[lag0 + 1, lag1 + 1, lag2 + 1, lag3 + 1, lag4 + 1, lag5 + 1, lag6 + 1] <- BIC(model)
}
}
}
}
}
}
}
# Handle NA values in aic_values and bic_values
aic_values[is.na(aic_values)] <- Inf # Replace NA with Inf
bic_values[is.na(bic_values)] <- Inf # Replace NA with Inf
view(aic_values)
# Identify optimal lags
optimal_lags_aic <- which(aic_values == min(aic_values, na.rm = TRUE), arr.ind = TRUE)
optimal_lags_bic <- which(bic_values == min(bic_values, na.rm = TRUE), arr.ind = TRUE)
```
We have the AIC's results model:
```{r}
generate_ARDL_formula <- function(lags_matrix, variable_names) {
# Format names: Add backticks if they contain spaces
formatted_names <- ifelse(grepl(" ", variable_names), paste0("`", variable_names, "`"), variable_names)
# Create lag terms for each variable
terms <- sapply(1:ncol(lags_matrix), function(i) {
paste(sprintf("lag(%s, %d)", formatted_names[i], 1:lags_matrix[1, i]), collapse = " + ")
})
# Construct and return the formula
paste("diff_annual_growth_rate ~", paste(terms, collapse = " + "))
}
# Usage
variable_names <- names(data)
# Generate and fit the ARDL model - AIC
ARDL_formula <- generate_ARDL_formula(optimal_lags_aic, variable_names)
ARDL_model_AIC <- dynlm(as.formula(ARDL_formula), data = data)
# Generate and fit the ARDL model - BIC
ARDL_formula <- generate_ARDL_formula(optimal_lags_bic, variable_names)
ARDL_model_BIC <- dynlm(as.formula(ARDL_formula), data = data)
options(scipen = 999)
BIC(ARDL_model_AIC)
summary(ARDL_model_AIC)
summary(ARDL_model_BIC)
```
We see that BIC's results are far superior compare to that of AIC. Thus, we move forward with BIC's model. We now check for model's multicolinearity by using VIF.
```{r}
ARDL_formula
# manually write down the regression
ARDL_model_BIC <- dynlm(diff_annual_growth_rate ~ lag(diff_annual_growth_rate, 1) + lag(diff_annual_growth_rate, 2) + lag(diff_sex_ratio, 1) + lag(diff_sex_ratio, 2) + lag(diff_sex_ratio, 3) + lag(diff_sex_ratio, 4) + lag(TFR, 1) + lag(TFR, 2) + lag(TFR, 3) + lag(TFR, 4) + lag(TFR, 5) + lag(`Net Migration Rate`, 1) + lag(`Net Migration Rate`, 2) + lag(hdi, 1) + lag(hdi, 2) + lag(hdi, 3) + lag(hdi, 4) + lag(gdi, 1) + lag(diff_gii, 1) + lag(diff_gii, 2), data = data)
# innitial results
summary(ARDL_model_BIC)
# current vif value
vif(ARDL_model_BIC)
```
The final model of VIF selection process:
```{r}
# model at end of VIF selection
ARDL_model_BIC <- dynlm(diff_annual_growth_rate ~ lag(diff_annual_growth_rate, 1) + lag(diff_annual_growth_rate, 2) + lag(diff_sex_ratio, 1) + lag(diff_sex_ratio, 2) + lag(diff_sex_ratio, 3) + lag(diff_sex_ratio, 4) + lag(TFR, 1) + lag(`Net Migration Rate`, 1) + lag(`Net Migration Rate`, 2) + lag(hdi, 1) + lag(gdi, 1) + lag(diff_gii, 1) + lag(diff_gii, 2), data = data)
# results
summary(ARDL_model_BIC)
# current vif value
vif(ARDL_model_BIC)
```
Backward variable selection
```{r}
ARDL_model_BIC <- dynlm(diff_annual_growth_rate ~ lag(diff_annual_growth_rate, 1) + lag(diff_sex_ratio, 4) + lag(TFR, 1) + lag(`Net Migration Rate`, 1) + lag(hdi, 1) + lag(diff_gii, 1), data = data)