Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functionality for data versions #40

Merged
merged 6 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/check-standard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}
# - {os: ubuntu-latest, r: 'oldrel-1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
Expand Down
6 changes: 4 additions & 2 deletions R/combine_replicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
#' @examples
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' replicates_data <- read_replicates(replicates_dir = local_dir,
#' violation = "reclutamiento", replicate_nums = c(1, 2), crash = TRUE)
#' violation = "reclutamiento", replicate_nums = c(1, 2), version = "v1",
#' crash = TRUE)
#' replicates_obs_data <- summary_observed("reclutamiento", replicates_data,
#' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
#' edad_minors_filter = FALSE, include_props = FALSE)
Expand Down Expand Up @@ -82,7 +83,8 @@ proportions_imputed <- function(complete_data,
#' @examples
#' \dontrun{
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2),
#' version = "v1")
#' replicates_obs_data <- summary_observed("reclutamiento", replicates_data,
#' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
#' edad_minors_filter = FALSE, include_props = FALSE, digits = 2)
Expand Down
66 changes: 55 additions & 11 deletions R/confirm_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#' @param replicate_path Path to the replicate to be confirmed. The name
#' of the files must include the violation in Spanish and lower case letters
#' (homicidio, secuestro, reclutamiento, desaparicion).
#' @param version Version of the data being read in. Options are "v1" or "v2".
#' "v1" is appropriate for replicating the results of the joint
#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your own analyses
#' of the conflict in Colombia.
#'
#' @return A data frame row with two columns: `replicate_path`, a string indicating the
#' path to the replicate checked and `confirmed`, a boolean value indicating
Expand All @@ -19,10 +23,16 @@
#' @examples
#' local_dir_csv <- system.file("extdata", "right",
#' "verdata-reclutamiento-R1.csv.zip", package = "verdata")
#' confirm_file(local_dir_csv)
#' confirm_file(local_dir_csv, version = "v1")
#'
#' @noRd
confirm_file <- function(replicate_path) {
confirm_file <- function(replicate_path, version) {

if (is.null(version) | !version %in% c("v1", "v2")) {

stop("Data version not properly specified. Options are 'v1' or 'v2'.")

}

violacion <- stringr::str_extract(pattern = "homicidio|desaparicion|secuestro|reclutamiento",
replicate_path)
Expand All @@ -37,19 +47,30 @@ confirm_file <- function(replicate_path) {
algo = "sha1",
file = TRUE)))

if (file_extension == "parquet") {
if (version == "v1" & file_extension == "parquet") {

file_test <- file %>%
file_test <- file_parquet_v1 %>%
dplyr::filter(replica %in% hash_file$replica &
violacion %in% hash_file$violacion)

} else if (version == "v2" & file_extension == "parquet") {

} else if (file_extension == "csv") {
file_test <- file_parquet_v2 %>%
dplyr::filter(replica %in% hash_file$replica &
violacion %in% hash_file$violacion)

} else if (version == "v1" & file_extension == "csv") {

file_test <- file_csv %>%
file_test <- file_csv_v1 %>%
dplyr::filter(replica %in% hash_file$replica &
violacion %in% hash_file$violacion)

} else if (version == "v1" & file_extension == "csv") {

file_test <- file_csv_v2 %>%
dplyr::filter(replica %in% hash_file$replica &
violacion %in% hash_file$violacion) # TODO: check

}

is_eq <- all.equal(file_test, hash_file)
Expand All @@ -64,10 +85,11 @@ confirm_file <- function(replicate_path) {

else {


final <- medidas(replicate_path)

summary_table <- get(violacion) %>%
violacion_file <- paste0(violacion, "_", version)

summary_table <- get(violacion_file) %>%
dplyr::filter(replica %in% final$replica)

final <- final[order(final$variable), ]
Expand Down Expand Up @@ -106,6 +128,10 @@ confirm_file <- function(replicate_path) {
#' "reclutamiento", and "desaparicion".
#' @param replicate_nums A numeric vector containing the replicates to be analyzed.
#' Values in the vector should be between 1 and 100 inclusive.
#' @param version Version of the data being read in. Options are "v1" or "v2".
#' "v1" is appropriate for replicating the results of the joint
#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your own analyses
#' of the conflict in Colombia.
#'
#' @return A data frame with one row per replicate in `replicate_nums` and two columns:
#' `replicate_path`, a string indicating the path to the replicate checked and
Expand All @@ -117,17 +143,35 @@ confirm_file <- function(replicate_path) {
#'
#' @examples
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' confirm_files(local_dir, "reclutamiento", c(1, 2))
confirm_files <- function(replicates_dir, violation, replicate_nums) {
#' confirm_files(local_dir, "reclutamiento", c(1, 2), version = "v1")
confirm_files <- function(replicates_dir, violation, replicate_nums, version) {

files <- build_path(replicates_dir, violation, replicate_nums)

results <- purrr::map_dfr(files, confirm_file)
results <- purrr::map_dfr(files, confirm_file, version = version)

if (any(!results$confirmed)) {

warning("Some replicate file contents do not match the published versions")

} else if (version == "v1") {

message("You are using v1 of the data. This version is appropriate for
replicating the results of the joint JEP-CEV-HRDAG project. If you
would like to conduct your own analysis of the conflict in Colombia,
please use v2 of the data.")

} else if (version == "v2") {

message("You are using v2 of the data. This version is appropriate for
conducting your own analysis of the conflict in Colombia. If you
would like to repliate the results of the joint JEP-CEV-HRDAG project,
please use v1 of the data.")

} else if (is.null(version) | !version %in% c("v1", "v2")) {

stop("Data version not properly specified. Options are 'v1' or 'v2'.")

}

return(results)
Expand Down
103 changes: 93 additions & 10 deletions R/read_replicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#' @param replicate_path Path to the replicate. The name of the file must include
#' the violation in Spanish and lower case letters (homicidio, secuestro,
#' reclutamiento, desaparicion).
#' @param version Version of the data being read in. Options are "v1" or "v2".
#' "v1" is appropriate for replicating the results of the joint
#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your own analyses
#' of the conflict in Colombia.
#'
#' @return A data frame with the data from the indicated replicate and a column
#' `match` indicating whether the file hash matches the expected hash.
Expand All @@ -22,17 +26,24 @@
#'
#' local_dir <- system.file("extdata", "right",
#' package = "verdata", "verdata-reclutamiento-R1.parquet")
#' read_replicate(local_dir)
#' read_replicate(local_dir, version = "v1")
#'
#' @noRd
read_replicate <- function(replicate_path) {
read_replicate <- function(replicate_path, version) {

if (is.null(version) | !version %in% c("v1", "v2")) {

stop("Data version not properly specified. Options are 'v1' or 'v2'.")

}

violacion <- stringr::str_extract(pattern = "homicidio|desaparicion|secuestro|reclutamiento",
replicate_path)

file_extension <- stringr::str_extract(pattern = "parquet|csv", replicate_path)

if (file_extension == "parquet") {

if (version == "v1" & file_extension == "parquet") {

replicate_data <- arrow::read_parquet(replicate_path)

Expand All @@ -41,11 +52,25 @@ read_replicate <- function(replicate_path) {
replicate_path),
hash = digest::digest(replicate_data, algo = "sha1"))

content_test <- content %>%
content_test <- content_parquet_v1 %>%
dplyr::filter(replica %in% hash_intor$replica,
violacion == hash_intor$violacion)

} else {

} else if (version == "v2" & file_extension == "parquet") {

replicate_data <- arrow::read_parquet(replicate_path)

hash_intor <- dplyr::tibble(violacion = violacion,
replica = stringr::str_extract(pattern = ("(?:R)\\d+"),
replicate_path),
hash = digest::digest(replicate_data, algo = "sha1"))

content_test <- content_parquet_v2 %>%
dplyr::filter(replica %in% hash_intor$replica,
violacion == hash_intor$violacion)

} else if (version == "v1" & file_extension == "csv") {

replicate_data <- readr::read_csv(replicate_path)

Expand All @@ -54,7 +79,20 @@ read_replicate <- function(replicate_path) {
replicate_path),
hash = digest::digest(replicate_data, algo = "sha1"))

content_test <- content_csv %>%
content_test <- content_csv_v1 %>%
dplyr::filter(replica %in% hash_intor$replica)


} else if (version == "v2" & file_extension == "csv") {

replicate_data <- readr::read_csv(replicate_path)

hash_intor <- dplyr::tibble(violacion = violacion,
replica = stringr::str_extract(pattern = ("(?:R)\\d+"),
replicate_path),
hash = digest::digest(replicate_data, algo = "sha1"))

content_test <- content_csv_v2 %>%
dplyr::filter(replica %in% hash_intor$replica)

}
Expand All @@ -69,7 +107,9 @@ read_replicate <- function(replicate_path) {

final <- medidas(replicate_path)

summary_table <- get(violacion) %>%
violacion_file <- paste0(violacion, "_", version)

summary_table <- get(violacion_file) %>%
dplyr::filter(replica %in% final$replica)

final <- final[order(final$variable), ]
Expand Down Expand Up @@ -102,6 +142,10 @@ read_replicate <- function(replicate_path) {
#' "homicidio", "secuestro", "reclutamiento", and "desaparicion".
#' @param replicate_nums A numeric vector containing the replicates to be analyzed.
#' Values in the vector should be between 1 and 100 inclusive.
#' @param version Version of the data being read in. Options are "v1" or "v2".
#' "v1" is appropriate for replicating the results of the joint
#' JEP-CEV-HRDAG project. "v2" is appropriate for conducting your own analyses
#' of the conflict in Colombia.
#' @param crash A parameter to define whether the function should crash if the
#' content of the file is not identical to the one published. If crash = TRUE
#' (default), it will return error and not read the data, if crash = FALSE, the
Expand All @@ -114,12 +158,18 @@ read_replicate <- function(replicate_path) {
#'
#' @examples
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' read_replicates(local_dir, "reclutamiento", 1, 2)
#' read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
read_replicates <- function(replicates_dir, violation, replicate_nums,
crash = TRUE) {
version, crash = TRUE) {

if (is.null(version) | !version %in% c("v1", "v2")) {

stop("Data version not properly specified. Options are 'v1' or 'v2'.")

}

files <- build_path(replicates_dir, violation, replicate_nums)
replicate_data <- purrr::map_dfr(files, read_replicate)
replicate_data <- purrr::map_dfr(files, read_replicate, version = version)

corrupted_replicates <- replicate_data %>%
dplyr::filter(!match) %>%
Expand All @@ -130,6 +180,22 @@ read_replicates <- function(replicates_dir, violation, replicate_nums,

if (all(replicate_data$match)) {

if (version == "v1") {

message("You are using v1 of the data. This version is appropriate for
replicating the results of the joint JEP-CEV-HRDAG project. If you
would like to conduct your own analysis of the conflict in Colombia,
please use v2 of the data.")

} else if (version == "v2") {

message("You are using v2 of the data. This version is appropriate for
conducting your own analysis of the conflict in Colombia. If you
would like to repliate the results of the joint JEP-CEV-HRDAG project,
please use v1 of the data.")

}

return(replicate_data %>% dplyr::select(-match))

} else {
Expand All @@ -141,6 +207,23 @@ read_replicates <- function(replicates_dir, violation, replicate_nums,
} else {

warning(glue::glue("The content of the files is not identical to the ones published.\nThe results of the analysis may be inconsistent.\nThe following replicates have incorrect content:\n{paste0(corrupted_replicates, collapse = '\n')}"))

if (version == "v1") {

message("You are using v1 of the data. This version is appropriate for
replicating the results of the joint JEP-CEV-HRDAG project. If you
would like to conduct your own analysis of the conflict in Colombia,
please use v2 of the data.")

} else if (version == "v2") {

message("You are using v2 of the data. This version is appropriate for
conducting your own analysis of the conflict in Colombia. If you
would like to repliate the results of the joint JEP-CEV-HRDAG project,
please use v1 of the data.")

}

return(replicate_data %>% dplyr::select(-match))

}
Expand Down
4 changes: 2 additions & 2 deletions R/summary_observed.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#' @examples
#' \dontrun{
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
#' tab_observed <- summary_observed("reclutamiento", replicates_data,
#' strata_vars = "sexo", conflict_filter = TRUE, forced_dis_filter = FALSE,
#' edad_minors_filter = TRUE, include_props = TRUE)
Expand Down Expand Up @@ -76,7 +76,7 @@ proportions_observed <- function(obs_data,
#'
#' @examples
#' local_dir <- system.file("extdata", "right", package = "verdata")
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2))
#' replicates_data <- read_replicates(local_dir, "reclutamiento", c(1, 2), version = "v1")
#' tab_observed <- summary_observed("reclutamiento", replicates_data,
#' strata_vars = "sexo", conflict_filter = FALSE, forced_dis_filter = FALSE,
#' edad_minors_filter = FALSE, include_props = FALSE, digits = 2)
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion man/combine_replicates.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions man/confirm_files.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading