Skip to content

Commit

Permalink
Merge pull request #40 from HRDAG/data-update
Browse files Browse the repository at this point in the history
Add functionality for data versions
  • Loading branch information
thegargiulian authored Sep 12, 2024
2 parents bd0ab33 + 2d7a43e commit 6e13f22
Show file tree
Hide file tree
Showing 21 changed files with 222 additions and 71 deletions.
6 changes: 4 additions & 2 deletions R/combine_replicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
#' @examples
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' replicates_data <- read_replicates(replicates_dir = local_dir,
#' violation = "reclutamiento", replicate_nums = c(1, 2), crash = TRUE)
#' violation = "reclutamiento", replicate_nums = c(1, 2), version = "v1",
#' crash = TRUE)
#' replicates_obs_data <- summary_observed("reclutamiento", replicates_data,
#' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
#' edad_minors_filter = FALSE, include_props = FALSE)
Expand Down Expand Up @@ -82,7 +83,8 @@ proportions_imputed <- function(complete_data,
#' @examples
#' \dontrun{
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2),
#' version = "v1")
#' replicates_obs_data <- summary_observed("reclutamiento", replicates_data,
#' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
#' edad_minors_filter = FALSE, include_props = FALSE, digits = 2)
Expand Down
66 changes: 55 additions & 11 deletions R/confirm_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#' @param replicate_path Path to the replicate to be confirmed. The name
#' of the files must include the violation in Spanish and lower case letters
#' (homicidio, secuestro, reclutamiento, desaparicion).
#' @param version Version of the data being read in. Options are "v1" or "v2".
#' "v1" is appropriate for replicating the results of the joint
#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your own analyses
#' of the conflict in Colombia.
#'
#' @return A data frame row with two columns: `replicate_path`, a string indicating the
#' path to the replicate checked and `confirmed`, a boolean value indicating
Expand All @@ -19,10 +23,16 @@
#' @examples
#' local_dir_csv <- system.file("extdata", "right",
#' "verdata-reclutamiento-R1.csv.zip", package = "verdata")
#' confirm_file(local_dir_csv)
#' confirm_file(local_dir_csv, version = "v1")
#'
#' @noRd
confirm_file <- function(replicate_path) {
confirm_file <- function(replicate_path, version) {

if (is.null(version) | !version %in% c("v1", "v2")) {

stop("Data version not properly specified. Options are 'v1' or 'v2'.")

}

violacion <- stringr::str_extract(pattern = "homicidio|desaparicion|secuestro|reclutamiento",
replicate_path)
Expand All @@ -37,19 +47,30 @@ confirm_file <- function(replicate_path) {
algo = "sha1",
file = TRUE)))

if (file_extension == "parquet") {
if (version == "v1" & file_extension == "parquet") {

file_test <- file %>%
file_test <- file_parquet_v1 %>%
dplyr::filter(replica %in% hash_file$replica &
violacion %in% hash_file$violacion)

} else if (version == "v2" & file_extension == "parquet") {

} else if (file_extension == "csv") {
file_test <- file_parquet_v2 %>%
dplyr::filter(replica %in% hash_file$replica &
violacion %in% hash_file$violacion)

} else if (version == "v1" & file_extension == "csv") {

file_test <- file_csv %>%
file_test <- file_csv_v1 %>%
dplyr::filter(replica %in% hash_file$replica &
violacion %in% hash_file$violacion)

} else if (version == "v1" & file_extension == "csv") {

file_test <- file_csv_v2 %>%
dplyr::filter(replica %in% hash_file$replica &
violacion %in% hash_file$violacion) # TODO: check

}

is_eq <- all.equal(file_test, hash_file)
Expand All @@ -64,10 +85,11 @@ confirm_file <- function(replicate_path) {

else {


final <- medidas(replicate_path)

summary_table <- get(violacion) %>%
violacion_file <- paste0(violacion, "_", version)

summary_table <- get(violacion_file) %>%
dplyr::filter(replica %in% final$replica)

final <- final[order(final$variable), ]
Expand Down Expand Up @@ -106,6 +128,10 @@ confirm_file <- function(replicate_path) {
#' "reclutamiento", and "desaparicion".
#' @param replicate_nums A numeric vector containing the replicates to be analyzed.
#' Values in the vector should be between 1 and 100 inclusive.
#' @param version Version of the data being read in. Options are "v1" or "v2".
#' "v1" is appropriate for replicating the results of the joint
#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your own analyses
#' of the conflict in Colombia.
#'
#' @return A data frame with one row per replicate file checked and two columns:
#' `replicate_path`, a string indicating the path to the replicate checked and
Expand All @@ -117,17 +143,35 @@ confirm_file <- function(replicate_path) {
#'
#' @examples
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' confirm_files(local_dir, "reclutamiento", c(1, 2))
confirm_files <- function(replicates_dir, violation, replicate_nums) {
#' confirm_files(local_dir, "reclutamiento", c(1, 2), version = "v1")
confirm_files <- function(replicates_dir, violation, replicate_nums, version) {

files <- build_path(replicates_dir, violation, replicate_nums)

results <- purrr::map_dfr(files, confirm_file)
results <- purrr::map_dfr(files, confirm_file, version = version)

if (any(!results$confirmed)) {

warning("Some replicate file contents do not match the published versions")

} else if (version == "v1") {

message("You are using v1 of the data. This version is appropriate for
replicating the results of the joint JEP-CEV-HRDAG project. If you
would like to conduct your own analysis of the conflict in Colombia,
please use v2 of the data.")

} else if (version == "v2") {

message("You are using v2 of the data. This version is appropriate for
conducting your own analysis of the conflict in Colombia. If you
would like to repliate the results of the joint JEP-CEV-HRDAG project,
please use v1 of the data.")

} else if (is.null(version) | !version %in% c("v1", "v2")) {

stop("Data version not properly specified. Options are 'v1' or 'v2'.")

}

return(results)
Expand Down
103 changes: 93 additions & 10 deletions R/read_replicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#' @param replicate_path Path to the replicate. The name of the file must include
#' the violation in Spanish and lower case letters (homicidio, secuestro,
#' reclutamiento, desaparicion).
#' @param version Version of the data being read in. Options are "v1" or "v2".
#' "v1" is appropriate for replicating the results of the joint
#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your own analyses
#' of the conflict in Colombia.
#'
#' @return A data frame with the data from the indicated replicate and a column
#' `match` indicating whether the file hash matches the expected hash.
Expand All @@ -22,17 +26,24 @@
#'
#' local_dir <- system.file("extdata", "right",
#' package = "verdata", "verdata-reclutamiento-R1.parquet")
#' read_replicate(local_dir)
#' read_replicate(local_dir, version = "v1")
#'
#' @noRd
read_replicate <- function(replicate_path) {
read_replicate <- function(replicate_path, version) {

if (is.null(version) | !version %in% c("v1", "v2")) {

stop("Data version not properly specified. Options are 'v1' or 'v2'.")

}

violacion <- stringr::str_extract(pattern = "homicidio|desaparicion|secuestro|reclutamiento",
replicate_path)

file_extension <- stringr::str_extract(pattern = "parquet|csv", replicate_path)

if (file_extension == "parquet") {

if (version == "v1" & file_extension == "parquet") {

replicate_data <- arrow::read_parquet(replicate_path)

Expand All @@ -41,11 +52,25 @@ read_replicate <- function(replicate_path) {
replicate_path),
hash = digest::digest(replicate_data, algo = "sha1"))

content_test <- content %>%
content_test <- content_parquet_v1 %>%
dplyr::filter(replica %in% hash_intor$replica,
violacion == hash_intor$violacion)

} else {

} else if (version == "v2" & file_extension == "parquet") {

replicate_data <- arrow::read_parquet(replicate_path)

hash_intor <- dplyr::tibble(violacion = violacion,
replica = stringr::str_extract(pattern = ("(?:R)\\d+"),
replicate_path),
hash = digest::digest(replicate_data, algo = "sha1"))

content_test <- content_parquet_v2 %>%
dplyr::filter(replica %in% hash_intor$replica,
violacion == hash_intor$violacion)

} else if (version == "v1" & file_extension == "csv") {

replicate_data <- readr::read_csv(replicate_path)

Expand All @@ -54,7 +79,20 @@ read_replicate <- function(replicate_path) {
replicate_path),
hash = digest::digest(replicate_data, algo = "sha1"))

content_test <- content_csv %>%
content_test <- content_csv_v1 %>%
dplyr::filter(replica %in% hash_intor$replica)


} else if (version == "v2" & file_extension == "csv") {

replicate_data <- readr::read_csv(replicate_path)

hash_intor <- dplyr::tibble(violacion = violacion,
replica = stringr::str_extract(pattern = ("(?:R)\\d+"),
replicate_path),
hash = digest::digest(replicate_data, algo = "sha1"))

content_test <- content_csv_v2 %>%
dplyr::filter(replica %in% hash_intor$replica)

}
Expand All @@ -69,7 +107,9 @@ read_replicate <- function(replicate_path) {

final <- medidas(replicate_path)

summary_table <- get(violacion) %>%
violacion_file <- paste0(violacion, "_", version)

summary_table <- get(violacion_file) %>%
dplyr::filter(replica %in% final$replica)

final <- final[order(final$variable), ]
Expand Down Expand Up @@ -102,6 +142,10 @@ read_replicate <- function(replicate_path) {
#' "homicidio", "secuestro", "reclutamiento", and "desaparicion".
#' @param replicate_nums A numeric vector containing the replicates to be analyzed.
#' Values in the vector should be between 1 and 100 inclusive.
#' @param version Version of the data being read in. Options are "v1" or "v2".
#' "v1" is appropriate for replicating the results of the joint
#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your own analyses
#' of the conflict in Colombia.
#' @param crash A parameter to define whether the function should crash if the
#' content of the file is not identical to the one published. If crash = TRUE
#' (default), it will return error and not read the data, if crash = FALSE, the
Expand All @@ -114,12 +158,18 @@ read_replicate <- function(replicate_path) {
#'
#' @examples
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' read_replicates(local_dir, "reclutamiento", 1, 2)
#' read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
read_replicates <- function(replicates_dir, violation, replicate_nums,
crash = TRUE) {
version, crash = TRUE) {

if (is.null(version) | !version %in% c("v1", "v2")) {

stop("Data version not properly specified. Options are 'v1' or 'v2'.")

}

files <- build_path(replicates_dir, violation, replicate_nums)
replicate_data <- purrr::map_dfr(files, read_replicate)
replicate_data <- purrr::map_dfr(files, read_replicate, version = version)

corrupted_replicates <- replicate_data %>%
dplyr::filter(!match) %>%
Expand All @@ -130,6 +180,22 @@ read_replicates <- function(replicates_dir, violation, replicate_nums,

if (all(replicate_data$match)) {

if (version == "v1") {

message("You are using v1 of the data. This version is appropriate for
replicating the results of the joint JEP-CEV-HRDAG project. If you
would like to conduct your own analysis of the conflict in Colombia,
please use v2 of the data.")

} else if (version == "v2") {

message("You are using v2 of the data. This version is appropriate for
conducting your own analysis of the conflict in Colombia. If you
would like to repliate the results of the joint JEP-CEV-HRDAG project,
please use v1 of the data.")

}

return(replicate_data %>% dplyr::select(-match))

} else {
Expand All @@ -141,6 +207,23 @@ read_replicates <- function(replicates_dir, violation, replicate_nums,
} else {

warning(glue::glue("The content of the files is not identical to the ones published.\nThe results of the analysis may be inconsistent.\nThe following replicates have incorrect content:\n{paste0(corrupted_replicates, collapse = '\n')}"))

if (version == "v1") {

message("You are using v1 of the data. This version is appropriate for
replicating the results of the joint JEP-CEV-HRDAG project. If you
would like to conduct your own analysis of the conflict in Colombia,
please use v2 of the data.")

} else if (version == "v2") {

message("You are using v2 of the data. This version is appropriate for
conducting your own analysis of the conflict in Colombia. If you
would like to repliate the results of the joint JEP-CEV-HRDAG project,
please use v1 of the data.")

}

return(replicate_data %>% dplyr::select(-match))

}
Expand Down
4 changes: 2 additions & 2 deletions R/summary_observed.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#' @examples
#' \dontrun{
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
#' tab_observed <- summary_observed("reclutamiento", replicates_data,
#' strata_vars = "sexo", conflict_filter = TRUE, forced_dis_filter = FALSE,
#' edad_minors_filter = TRUE, include_props = TRUE)
Expand Down Expand Up @@ -76,7 +76,7 @@ proportions_observed <- function(obs_data,
#'
#' @examples
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
#' tab_observed <- summary_observed("reclutamiento", replicates_data,
#' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
#' edad_minors_filter = FALSE, include_props = FALSE, digits = 2)
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion man/combine_replicates.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions man/confirm_files.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/proportions_imputed.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 6e13f22

Please sign in to comment.