HRDAG · thegargiulian · Sep 12, 2024 · Aug 26, 2024 · Aug 26, 2024 · Aug 26, 2024
diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml
@@ -22,7 +22,7 @@ jobs:
           - {os: windows-latest, r: 'release'}
           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
-          - {os: ubuntu-latest,   r: 'oldrel-1'}
+            #          - {os: ubuntu-latest,   r: 'oldrel-1'}
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

diff --git a/R/combine_replicates.R b/R/combine_replicates.R
@@ -19,7 +19,8 @@
 #' @examples
 #' local_dir <- system.file("extdata", "right", package = "verdata")
 #' replicates_data <- read_replicates(replicates_dir = local_dir,
-#' violation = "reclutamiento", replicate_nums = c(1, 2), crash = TRUE)
+#' violation = "reclutamiento", replicate_nums = c(1, 2), version = "v1",
+#' crash = TRUE)
 #' replicates_obs_data <- summary_observed("reclutamiento", replicates_data,
 #' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
 #' edad_minors_filter = FALSE, include_props = FALSE)
@@ -82,7 +83,8 @@ proportions_imputed <- function(complete_data,
 #' @examples
 #' \dontrun{
 #' local_dir <- system.file("extdata", "right", package = "verdata")
-#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
+#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2),
+#' version = "v1")
 #' replicates_obs_data <- summary_observed("reclutamiento", replicates_data,
 #' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
 #' edad_minors_filter = FALSE, include_props = FALSE, digits = 2)

diff --git a/R/confirm_files.R b/R/confirm_files.R
@@ -9,6 +9,10 @@
 #' @param replicate_path Path to the replicate to be confirmed. The name
 #' of the files must include the violation in Spanish and lower case letters
 #' (homicidio, secuestro, reclutamiento, desaparicion).
+#' @param version Version of the data being read in. Options are "v1" or "v2".
+#' "v1" is appropriate for replicating the results of the joint
+#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your new analyses
+#' of the conflict in Colombia.
 #
 #' @return A data frame row with two columns: `replicate_path`, a string indicating the
 #' path to the replicate checked and `confirmed`, a boolean value indicating
@@ -19,10 +23,16 @@
 #' @examples
 #' local_dir_csv <- system.file("extdata", "right",
 #' "verdata-reclutamiento-R1.csv.zip", package = "verdata")
-#' confirm_file(local_dir_csv)
+#' confirm_file(local_dir_csv, version = "v1")
 #'
 #' @noRd
-confirm_file <- function(replicate_path) {
+confirm_file <- function(replicate_path, version) {
+
+    if (is.null(version) || !version %in% c("v1", "v2")) {
+        # `||` short-circuits, so a NULL `version` reaches stop() below.
+        stop("Data version not properly specified. Options are 'v1' or 'v2'.")
+
+    }
 
     violacion <- stringr::str_extract(pattern = "homicidio|desaparicion|secuestro|reclutamiento",
                                       replicate_path)
@@ -37,19 +47,30 @@ confirm_file <- function(replicate_path) {
                                                                      algo = "sha1",
                                                                      file = TRUE)))
 
-    if (file_extension == "parquet") {
+    if (version == "v1" & file_extension == "parquet") {
 
-        file_test <- file %>%
+        file_test <- file_parquet_v1 %>%
             dplyr::filter(replica %in% hash_file$replica &
                               violacion %in% hash_file$violacion)
 
+    } else if (version == "v2" & file_extension == "parquet") {
 
-    } else if (file_extension == "csv") {
+        file_test <- file_parquet_v2 %>%
+            dplyr::filter(replica %in% hash_file$replica &
+                              violacion %in% hash_file$violacion)
+
+    } else if (version == "v1" & file_extension == "csv") {
 
-        file_test <- file_csv %>%
+        file_test <- file_csv_v1 %>%
             dplyr::filter(replica %in% hash_file$replica &
                               violacion %in% hash_file$violacion)
 
+    } else if (version == "v2" & file_extension == "csv") {
+        # v2 + csv: look up the expected record in `file_csv_v2`.
+        file_test <- file_csv_v2 %>%
+            dplyr::filter(replica %in% hash_file$replica &
+                              violacion %in% hash_file$violacion) # TODO: check
+
     }
 
     is_eq <- all.equal(file_test, hash_file)
@@ -64,10 +85,11 @@ confirm_file <- function(replicate_path) {
 
     else {
 
-
         final <- medidas(replicate_path)
 
-        summary_table <- get(violacion) %>%
+        violacion_file <- paste0(violacion, "_", version)
+
+        summary_table <- get(violacion_file) %>%
             dplyr::filter(replica %in% final$replica)
 
         final <- final[order(final$variable), ]
@@ -106,6 +128,10 @@ confirm_file <- function(replicate_path) {
 #' "reclutamiento", and "desaparicion".
 #' @param replicate_nums A numeric vector containing the replicates to be analyzed.
 #' Values in the vector should be between 1 and 100 inclusive.
+#' @param version Version of the data being read in. Options are "v1" or "v2".
+#' "v1" is appropriate for replicating the results of the joint
+#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your new analyses
+#' of the conflict in Colombia.
 #'
 #' @return A data frame row with `replicate_num` rows and two columns:
 #' `replicate_path`, a string indicating the path to the replicate checked and
@@ -117,17 +143,35 @@ confirm_file <- function(replicate_path) {
 #'
 #' @examples
 #' local_dir <- system.file("extdata", "right", package = "verdata")
-#' confirm_files(local_dir, "reclutamiento", c(1, 2))
-confirm_files <- function(replicates_dir, violation, replicate_nums) {
+#' confirm_files(local_dir, "reclutamiento", c(1, 2), version = "v1")
+confirm_files <- function(replicates_dir, violation, replicate_nums, version) {
 
     files <- build_path(replicates_dir, violation, replicate_nums)
 
-    results <- purrr::map_dfr(files, confirm_file)
+    results <- purrr::map_dfr(files, confirm_file, version = version)
 
     if (any(!results$confirmed)) {
 
         warning("Some replicate file contents do not match the published versions")
 
+    } else if (version == "v1") {
+
+        message("You are using v1 of the data. This version is appropriate for
+                replicating the results of the joint JEP-CEV-HRDAG project. If you
+                would like to conduct your own analysis of the conflict in Colombia,
+                please use v2 of the data.")
+
+    } else if (version == "v2") {
+
+        message("You are using v2 of the data. This version is appropriate for
+                conducting your own analysis of the conflict in Colombia. If you
+                would like to repliate the results of the joint JEP-CEV-HRDAG project,
+                please use v1 of the data.")
+
+    } else if (is.null(version) | !version %in% c("v1", "v2")) {
+
+        stop("Data version not properly specified. Options are 'v1' or 'v2'.")
+
     }
 
     return(results)

diff --git a/R/read_replicates.R b/R/read_replicates.R
@@ -9,6 +9,10 @@
 #' @param replicate_path Path to the replicate. The name of the file must include
 #' the violation in Spanish and lower case letters (homicidio, secuestro,
 #' reclutamiento, desaparicion).
+#' @param version Version of the data being read in. Options are "v1" or "v2".
+#' "v1" is appropriate for replicating the results of the joint
+#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your new analyses
+#' of the conflict in Colombia.
 #'
 #' @return A data frame with the data from the indicated replicate and a column
 #' `match` indicating whether the file hash matches the expected hash.
@@ -22,17 +26,24 @@
 #'
 #' local_dir <- system.file("extdata", "right",
 #' package = "verdata", "verdata-reclutamiento-R1.parquet")
-#' read_replicate(local_dir)
+#' read_replicate(local_dir, version = "v1")
 #'
 #' @noRd
-read_replicate <- function(replicate_path) {
+read_replicate <- function(replicate_path, version) {
+
+    if (is.null(version) || !version %in% c("v1", "v2")) {
+        # `||` short-circuits, so a NULL `version` reaches stop() below.
+        stop("Data version not properly specified. Options are 'v1' or 'v2'.")
+
+    }
 
     violacion <- stringr::str_extract(pattern = "homicidio|desaparicion|secuestro|reclutamiento",
                                       replicate_path)
 
     file_extension <- stringr::str_extract(pattern = "parquet|csv", replicate_path)
 
-    if (file_extension == "parquet") {
+
+    if (version == "v1" & file_extension == "parquet") {
 
         replicate_data <- arrow::read_parquet(replicate_path)
 
@@ -41,11 +52,25 @@ read_replicate <- function(replicate_path) {
                                                                    replicate_path),
                                     hash = digest::digest(replicate_data, algo = "sha1"))
 
-        content_test <- content %>%
+        content_test <- content_parquet_v1 %>%
             dplyr::filter(replica %in% hash_intor$replica,
                           violacion == hash_intor$violacion)
 
-    } else {
+
+    } else if (version == "v2" & file_extension == "parquet") {
+
+        replicate_data <- arrow::read_parquet(replicate_path)
+
+        hash_intor <- dplyr::tibble(violacion = violacion,
+                                    replica = stringr::str_extract(pattern = ("(?:R)\\d+"),
+                                                                   replicate_path),
+                                    hash = digest::digest(replicate_data, algo = "sha1"))
+
+        content_test <- content_parquet_v2 %>%
+            dplyr::filter(replica %in% hash_intor$replica,
+                          violacion == hash_intor$violacion)
+
+    } else if (version == "v1" & file_extension == "csv") {
 
         replicate_data <- readr::read_csv(replicate_path)
 
@@ -54,7 +79,20 @@ read_replicate <- function(replicate_path) {
                                                                    replicate_path),
                                     hash = digest::digest(replicate_data, algo = "sha1"))
 
-        content_test <- content_csv %>%
+        content_test <- content_csv_v1 %>%
+            dplyr::filter(replica %in% hash_intor$replica)
+
+
+    } else if (version == "v2" & file_extension == "csv") {
+
+        replicate_data <- readr::read_csv(replicate_path)
+
+        hash_intor <- dplyr::tibble(violacion = violacion,
+                                    replica = stringr::str_extract(pattern = ("(?:R)\\d+"),
+                                                                   replicate_path),
+                                    hash = digest::digest(replicate_data, algo = "sha1"))
+
+        content_test <- content_csv_v2 %>%
             dplyr::filter(replica %in% hash_intor$replica)
 
     }
@@ -69,7 +107,9 @@ read_replicate <- function(replicate_path) {
 
         final <- medidas(replicate_path)
 
-        summary_table <- get(violacion) %>%
+        violacion_file <- paste0(violacion, "_", version)
+
+        summary_table <- get(violacion_file) %>%
             dplyr::filter(replica %in% final$replica)
 
         final <- final[order(final$variable), ]
@@ -102,6 +142,10 @@ read_replicate <- function(replicate_path) {
 #' "homicidio", "secuestro", "reclutamiento", and "desaparicion".
 #' @param replicate_nums A numeric vector containing the replicates to be analyzed.
 #' Values in the vector should be between 1 and 100 inclusive.
+#' @param version Version of the data being read in. Options are "v1" or "v2".
+#' "v1" is appropriate for replicating the results of the joint
+#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your new analyses
+#' of the conflict in Colombia.
 #' @param crash A parameter to define whether the function should crash if the
 #' content of the file is not identical to the one published. If crash = TRUE
 #' (default), it will return error and not read the data, if crash = FALSE, the
@@ -114,12 +158,18 @@ read_replicate <- function(replicate_path) {
 #'
 #' @examples
 #' local_dir <- system.file("extdata", "right", package = "verdata")
-#' read_replicates(local_dir, "reclutamiento", 1, 2)
+#' read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
 read_replicates <- function(replicates_dir, violation, replicate_nums,
-                            crash = TRUE) {
+                            version, crash = TRUE) {
+
+    if (is.null(version) || !version %in% c("v1", "v2")) {
+        # `||` short-circuits, so a NULL `version` reaches stop() below.
+        stop("Data version not properly specified. Options are 'v1' or 'v2'.")
+
+    }
 
     files <- build_path(replicates_dir, violation, replicate_nums)
-    replicate_data <- purrr::map_dfr(files, read_replicate)
+    replicate_data <- purrr::map_dfr(files, read_replicate, version = version)
 
     corrupted_replicates <- replicate_data %>%
         dplyr::filter(!match) %>%
@@ -130,6 +180,22 @@ read_replicates <- function(replicates_dir, violation, replicate_nums,
 
         if (all(replicate_data$match)) {
 
+            if (version == "v1") {
+
+                message("You are using v1 of the data. This version is appropriate for
+                replicating the results of the joint JEP-CEV-HRDAG project. If you
+                would like to conduct your own analysis of the conflict in Colombia,
+                please use v2 of the data.")
+
+            } else if (version == "v2") {
+
+                message("You are using v2 of the data. This version is appropriate for
+                conducting your own analysis of the conflict in Colombia. If you
+                would like to repliate the results of the joint JEP-CEV-HRDAG project,
+                please use v1 of the data.")
+
+            }
+
             return(replicate_data %>% dplyr::select(-match))
 
         } else {
@@ -141,6 +207,23 @@ read_replicates <- function(replicates_dir, violation, replicate_nums,
     } else {
 
         warning(glue::glue("The content of the files is not identical to the ones published.\nThe results of the analysis may be inconsistent.\nThe following replicates have incorrect content:\n{paste0(corrupted_replicates, collapse = '\n')}"))
+
+        if (version == "v1") {
+
+            message("You are using v1 of the data. This version is appropriate for
+                replicating the results of the joint JEP-CEV-HRDAG project. If you
+                would like to conduct your own analysis of the conflict in Colombia,
+                please use v2 of the data.")
+
+        } else if (version == "v2") {
+
+            message("You are using v2 of the data. This version is appropriate for
+                conducting your own analysis of the conflict in Colombia. If you
+                would like to repliate the results of the joint JEP-CEV-HRDAG project,
+                please use v1 of the data.")
+
+        }
+
         return(replicate_data %>% dplyr::select(-match))
 
     }

diff --git a/R/summary_observed.R b/R/summary_observed.R
@@ -18,7 +18,7 @@
 #' @examples
 #' \dontrun{
 #' local_dir <- system.file("extdata", "right", package = "verdata")
-#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
+#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
 #' tab_observed <- summary_observed("reclutamiento", replicates_data,
 #' strata_vars = "sexo", conflict_filter = TRUE, forced_dis_filter = FALSE,
 #' edad_minors_filter = TRUE, include_props = TRUE)
@@ -76,7 +76,7 @@ proportions_observed <- function(obs_data,
 #'
 #' @examples
 #' local_dir <- system.file("extdata", "right", package = "verdata")
-#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
+#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
 #' tab_observed <- summary_observed("reclutamiento", replicates_data,
 #' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
 #' edad_minors_filter = FALSE, include_props = FALSE, digits = 2)

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/inst/extdata/corrected-right/verdata-corrected-reclutamiento-R1.csv.zip b/inst/extdata/corrected-right/verdata-corrected-reclutamiento-R1.csv.zip
diff --git a/inst/extdata/corrected-right/verdata-corrected-reclutamiento-R1.parquet b/inst/extdata/corrected-right/verdata-corrected-reclutamiento-R1.parquet
diff --git a/inst/extdata/corrected-right/verdata-corrected-reclutamiento-R2.parquet b/inst/extdata/corrected-right/verdata-corrected-reclutamiento-R2.parquet
diff --git a/man/combine_replicates.Rd b/man/combine_replicates.Rd
diff --git a/man/confirm_files.Rd b/man/confirm_files.Rd