diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..d130929 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,6 @@ +^LICENSE\.md$ +^data-raw$ +^README\.Rmd$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..08ea0cd --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,41 @@ +Package: xmap +Title: A principled approach to recoding and redistributing data between + nomenclature +Version: 0.0.1 +Authors@R: c( + person("Cynthia", "Huang", , "cynthia@gmail.com", role = c("aut", "cre")), + person("Laura", "Puzzello", role = c("aut", "fnd")) + ) +Description: Provides tools for creating and verifying classification, + category and/or nomenclature mapping objects. +License: MIT + file LICENSE +URL: https://github.com/cynthiahqy/xmap +Config/testthat/edition: 3 +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.2.2 +Depends: + R (>= 4.1) +Imports: + cli, + dplyr, + glue, + rlang (>= 1.0.0), + tibble, + tidyr +Suggests: + forcats, + ggbump, + ggplot2, + knitr, + matlib, + Matrix, + patchwork, + rmarkdown, + stats, + stringr, + testthat (>= 3.0.0) +LazyData: true +VignetteBuilder: knitr +LitrVersionUsed: 0.7.0 +LitrId: 63fe8913732cbd8d745656e34cc88010 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..355e013 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2023 +COPYRIGHT HOLDER: C. Huang diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..9e0db0d --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +# MIT License + +Copyright (c) 2023 C. Huang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..58ff08a --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,38 @@ +# Generated by roxygen2: do not edit by hand + +S3method(as_xmap_df,data.frame) +S3method(print,xmap_df) +S3method(xmap_reverse,xmap_df) +S3method(xmap_to_matrix,xmap_df) +export(add_weights_equal) +export(add_weights_unit) +export(as_pairs_from_named) +export(as_xmap_df) +export(is_xmap) +export(is_xmap_df) +export(msg_abort_frac_weights) +export(msg_abort_named_matchset) +export(pairs_to_named_list) +export(pairs_to_named_vector) +export(verify_links_as_xmap) +export(verify_named_all_1to1) +export(verify_named_all_names_unique) +export(verify_named_all_unique) +export(verify_named_all_values_unique) +export(verify_named_as_recode_unique) +export(verify_named_matchset_names_contain) +export(verify_named_matchset_names_exact) +export(verify_named_matchset_names_within) +export(verify_named_matchset_values_contain) +export(verify_named_matchset_values_exact) +export(verify_named_matchset_values_within) +export(verify_named_no_dup_names) +export(verify_named_no_dup_values) +export(verify_pairs_all_1to1) +export(verify_pairs_all_unique) +export(verify_pairs_as_recode_unique) +export(xmap_drop_extra) +export(xmap_reverse) +export(xmap_to_matrix) +export(xmap_to_named_list) +export(xmap_to_named_vector) diff --git a/R/abort.R b/R/abort.R new file mode 100644 index 0000000..02b80cc --- /dev/null +++ b/R/abort.R @@ -0,0 +1,120 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' @describeIn abort Abort if named columns can't be found in df +#' +abort_missing_cols <- function(df, cols) { + missing_cols <- setdiff(cols, names(df)) + if (length(missing_cols) != 0) { + cli::cli_abort( + message = "The column{?s} {.var {missing_cols}} {?was/were} not found.", + class = "abort_missing_cols" + ) + } + invisible(df) +} + +#' @describeIn abort Abort if xmap_df has missing values +#' +abort_any_na <- function(df) { + if (base::anyNA(df)) { + cli::cli_abort( + message = "NA values found. Please enter missing `from` or `to` node labels and/or convert NA weights", + class = "abort_na" + ) + } + invisible(df) +} + +#' Validation functions and messages for xmap or candidate links (Internal) +#' +#' @description +#' Checks issues with data.frame like objects containing validated `xmap` or candidate links. +#' +#' @param df a data.frame-like object containing links +#' @param col_from,col_to,col_weights character vector or values naming columns from `df` +#' +#' @returns An error if the validation condition fails, +#' and invisibly returns `df` otherwise. +#' +#' @name abort +NULL + +#' @describeIn abort Abort if xmap_df has wrong column types +#' +abort_weights_col_type <- function(df, col_weights) { + if (!is.numeric(df[[col_weights]])) { + cli::cli_abort( + message = "The column `{col_weights}` should be of type numeric", + class = "abort_col_type" + ) + } + invisible(df) +} + +#' @describeIn abort Abort if duplicate source-target pairs are found +#' +abort_dup_pairs <- function(df, col_from, col_to) { + if (!vhas_no_dup_pairs(df[[col_from]], df[[col_to]])) { + cli::cli_abort( + message = "Duplicate `from`-`to` links were found. + Please remove or collapse duplicates.", + class = "abort_dup_pairs" + ) + } + invisible(df) +} + +#' @describeIn abort Abort for invalid mapping weights +#' +abort_bad_weights <- function(col_weights, call = rlang::caller_env()) { + cli::cli_abort( + message = c( + "Incomplete mapping weights found", + "x" = "{.var {col_weights}} does not sum to 1", + "i" = "Modify weights or adjust `tol` and try again."), + class = "abort_bad_weights", + call = call + ) +} + +#' @describeIn abort Abort if xmap_df columns are not in order +abort_col_order <- function(df, col_from, col_to, col_weights){ + correct_order <- c(col_from, col_to, col_weights) + first_three <- names(df[1:3]) + if(!identical(first_three, correct_order)){ + rlang::abort( + message = "columns are not sorted in order `from`, `to`, `weights`", + class = "abort_col_order" + ) + } + invisible(df) +} + +#' @describeIn abort Abort if from_set attribute doesn't match xmap_df values +#' +abort_from_set <- function(df, col_from, from_set) { + col_from_set <- as.character(unique(df[[col_from]])) + stopifnot(identical(col_from_set, from_set)) + + invisible(df) +} + +#' @describeIn abort Abort message for fractional weights +#' @export +msg_abort_frac_weights <- function(impact){ + cli::format_error(c( + "`x` contains fractional weights. {impact}", + "x" = "You've supplied a xmap with weights not equal to 1") + ) +} + +#' @describeIn abort Abort if xmap_df is not reversible without new weights +#' +abort_not_reversible <- function(df, col_to) { + x_to <- df[[col_to]] + if (vhas_collapse(x_to)){ + cli::cli_abort("Collapse links in {.var xmap_df} cannot be reversed. Please supply new weights and create a new xmap.") + } + invisible(df) +} + diff --git a/R/add_weights_equal.R b/R/add_weights_equal.R new file mode 100644 index 0000000..01ac4b5 --- /dev/null +++ b/R/add_weights_equal.R @@ -0,0 +1,28 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Add equal fractional weights to groups of source-target node pairs +#' +#' Attaches equal weights to every source-target link from a given source node. +#' The resultant weighted links can be verified or coerced into an `xmap`. +#' +#' @inheritParams verify_pairs +#' @inheritParams add_weights_unit +#' +#' @return `pairs` with additional column of weights +#' @family {Helpers for adding weights to pairs} +#' @export +#' +#' @examples +#' animal_pairs <- list(MAMM = c("elephant", "whale", "monkey"), +#' REPT = c("lizard", "turtle"), +#' CRUS = c("crab")) |> +#' as_pairs_from_named("class", "animal") +#' animal_pairs |> +#' add_weights_equal(from = class, to = animal) +add_weights_equal <- function(df, from, to, weights_into = "weights"){ + ## TODO: validate_pairs_unique() + df |> + dplyr::group_by({{from}}) |> + dplyr::mutate("{weights_into}" := 1/dplyr::n_distinct({{to}})) |> + dplyr::ungroup() +} diff --git a/R/add_weights_unit.R b/R/add_weights_unit.R new file mode 100644 index 0000000..d4128f7 --- /dev/null +++ b/R/add_weights_unit.R @@ -0,0 +1,24 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Add unit weights to node pairs table +#' +#' Attaches column of unit weights to pairs of source-target nodes. +#' The resultant weighted links can be verified or coerced into `xmap`. +#' +#' @inheritParams verify_pairs +#' @param weights_into character string naming new column to store link weights in +#' +#' @return `pairs` with additional column of ones +#' @family {Helpers for adding weights to pairs} +#' @export +#' +#' @examples +#' AUS_pairs <- list(AUS = c("NSW", "QLD", "SA", "TAS", "VIC", "WA", "ACT", "NT")) |> +#' as_pairs_from_named(names_to = "ctr", values_to = "state") +#' AUS_pairs |> +#' add_weights_unit(weights_into = "weights") +add_weights_unit <- function(df, weights_into = "weights"){ + ## TODO: validate_pairs_unique() + df[,weights_into] <- 1 + return(df) +} diff --git a/R/as_xmap.R b/R/as_xmap.R new file mode 100644 index 0000000..339c1a2 --- /dev/null +++ b/R/as_xmap.R @@ -0,0 +1,80 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Coerce objects to xmap_df +#' +#' Validates and creates a valid crossmap `xmap_df` object. +#' +#' @param x +#' * For `as_xmap_df()`: An object to coerce +#' * For `is_xmap_df()`: An object to test. +#' @param from,to Columns in `x` specifying the source and target nodes +#' @param weights Column in `x` specifying the weight applied to data passed along the directed link between source and target node +#' @inheritParams vhas_complete_weights +#' @param subclass Which xmap subclass to return. Defaults to `xmap_df` for `data.frame` and `tibble` +#' @param .drop_extra Drop columns other than `from`, `to` and `weights`. Defaults to `TRUE` +#' +#' @return A validated `xmap` object. +#' @name as_xmap +#' @export +as_xmap_df <- function(x, from, to, weights, tol = .Machine$double.eps^0.5, subclass = c("xmap_df"), ...) { + UseMethod("as_xmap_df") +} + +#' @describeIn as_xmap Coerce a `data.frame` to `xmap` +#' +#' @export +#' @examples +#' # For a well formed crossmap: +#' links <- data.frame( +#' a = "AUS", +#' b = c("VIC", "NSW", "WA", "OTHER"), +#' w = c(0.1, 0.15, 0.25, 0.5) +#' ) +#' as_xmap_df(links, from = a, to = b, weights = w) +#' +#' # extra columns are dropped, +#' links$extra <- c(2, 4, 5, 6) +#' as_xmap_df(links, from = a, to = b, weights = w) +as_xmap_df.data.frame <- function(x, from, to, weights, tol = .Machine$double.eps^0.5, subclass = "xmap_df", .drop_extra = TRUE) { + ## coercion & checks + stopifnot(is.data.frame(x)) + + # get string names for columns + col_from <- deparse(substitute(from)) + col_to <- deparse(substitute(to)) + col_weights <- deparse(substitute(weights)) + col_strings <- c(col_from, col_to, col_weights) + ## check columns exist + abort_missing_cols(x, col_strings) + + ## drop additional columns + if (.drop_extra) { + df <- x[col_strings] + } else { + df <- x + } + if (ncol(df) < ncol(x)) { + cli::cli_inform("Dropped additional columns in {.arg {deparse(substitute(x))}}") + } + + ## rearrange columns + col_order <- c(col_strings, setdiff(names(df), col_strings)) + df <- df[col_order] + + ## construction + xmap <- switch(subclass, + xmap_df = new_xmap_df(df, col_from, col_to, col_weights), + stop("Unknown xmap subclass")) + + ## validation + ## ---- xmap graph properties ---- + abort_weights_col_type(df, col_weights) + abort_dup_pairs(df, col_from, col_to) + stop_bad_weights <- !vhas_complete_weights(df[[col_from]], df[[col_weights]], tol) + if (stop_bad_weights) { abort_bad_weights(col_weights) } + + ## ---- xmap_df attributes ---- + validate_xmap_df(xmap) + + return(xmap) +} diff --git a/R/calc.R b/R/calc.R new file mode 100644 index 0000000..10b12dc --- /dev/null +++ b/R/calc.R @@ -0,0 +1,12 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' +.calc_unique_sets.xmap_df <- function(x){ + stopifnot(is_xmap_df(x)) + df <- data.frame(x) + x_attrs <- attributes(x) + uniq_sets <- list() + uniq_sets$from_set <- as.character(unique(df[[x_attrs$col_from]])) + uniq_sets$to_set <- as.character(unique(df[[x_attrs$col_to]])) + return(uniq_sets) +} diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..061058f --- /dev/null +++ b/R/data.R @@ -0,0 +1,20 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Mock input objects for the `xmap` package +#' +#' A collection of mock inputs for experimenting with functions +#' in the `xmap` package. +#' `named_` objects are either named vectors or nested lists. +#' `df_` objects may contain source-target *pairs* (no weights), +#' or weighted source-target *links*. +#' +#' @format ## `mock` +#' A list with: +#' \describe{ +#' \item{named_ctr_iso3c}{named vector. Names are ISO-3 country codes, values are ISO English country names. Retrieved from `countrycode` package: +#' \url{https://github.com/vincentarelbundock/countrycode}} +#' \item{df_anzsco21}{4-column tibble. Contains major and submajor occupation codes and descriptions for ANZSCO21. Retrieved from `strayr` package: +#' \url{https://github.com/runapp-aus/strayr}} +#' \item{df_mixed}{3-column data.frame. } +#' } +"mock" diff --git a/R/get-helpers.R b/R/get-helpers.R new file mode 100644 index 0000000..6a47ac7 --- /dev/null +++ b/R/get-helpers.R @@ -0,0 +1,9 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' +.get_col_attrs.xmap_df <- function(x){ + stopifnot(is_xmap_df(x)) + x_attrs <- attributes(x) + col_attrs <- x_attrs[startsWith(names(x_attrs), "col")] + return(col_attrs) +} diff --git a/R/is_xmap.R b/R/is_xmap.R new file mode 100644 index 0000000..b0203aa --- /dev/null +++ b/R/is_xmap.R @@ -0,0 +1,17 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Test if object is a crossmap +#' +#' This function returns `TRUE` for crossmaps `xmap` or subclasses thereof (`xmap_df`), and `FALSE` for all other objects, including regular data.frames or tibbles. +#' @export +#' @rdname as_xmap +is_xmap <- function(x) { + base::inherits(x, "xmap") +} + +#' Test if object is `xmap_df` +#' @export +#' @rdname as_xmap +is_xmap_df <- function(x) { + rlang::inherits_all(x, c("xmap_df", "xmap")) +} diff --git a/R/new_xmap.R b/R/new_xmap.R new file mode 100644 index 0000000..ec4fd24 --- /dev/null +++ b/R/new_xmap.R @@ -0,0 +1,12 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' +.calc_xmap_subclass_attr <- function(subclass = c("xmap_df")){ + subclass <- rlang::arg_match(subclass) + + class_attr <- switch(subclass, + xmap_df = c("xmap_df", "xmap", "data.frame"), + stop("Unknown xmap subclass")) + + return(class_attr) +} diff --git a/R/pairs_named.R b/R/pairs_named.R new file mode 100644 index 0000000..abefd24 --- /dev/null +++ b/R/pairs_named.R @@ -0,0 +1,69 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Convert between column pairs and named vector or lists +#' +#' @description +#' Convert named vectors or nested lists into a two-column table of node pairs and vice versa. +#' `as_pairs_from_named` extracts the vector or list element names and the values, unnesting where necessary. +#' `pairs_to_named_vector` extracts name-value pairs from column pairs as is, +#' whilst `pairs_to_named_list()` nests the values first. +#' +#' @inheritParams verify_named +#' @inheritParams verify_pairs +#' @param names_to,values_to character vector specify the new columns to pass the information in `x` into. +#' @param names_from,values_from two columns in `x` to convert to names and values +#' +#' @return +#' * For `as_pairs_from_named()`: a two-column tibble +#' * For `pairs_to_named` fncs: named vector or list +#' +#' @name as_pairs +#' @examples +#' # Coerce named vectors and list to column pairs +#' +#' veg_vec <- c(eggplant = "aubergine", zucchini = "courgette") +#' as_pairs_from_named(veg_vec, "au_eng", "uk_eng") +#' +#' animal_list <- list(MAMM = c("elephant", "whale", "monkey"), +#' REPT = c("lizard", "turtle"), +#' CRUS = c("crab")) +#' as_pairs_from_named(animal_list, "class", "animal") +#' +#' # Convert pairs back to named vector and lists +#' veg_from_pairs <- as_pairs_from_named(veg_vec) |> +#' pairs_to_named_vector(names_from = name, values_from = value) +#' identical(veg_vec, veg_from_pairs) +#' +#' animal_from_pairs <- as_pairs_from_named(animal_list, "class", "animal") |> +#' pairs_to_named_list(names_from = class, values_from = animal) +#' identical(animal_list, animal_from_pairs) +NULL + +#' @describeIn as_pairs Convert named vector or nested list into column pairs +#' @export +as_pairs_from_named <- function(x, names_to = "name", values_to = "value"){ + stopifnot(is.vector(x)) + node_pairs <- x |> + tibble::enframe(name = names_to, value = values_to) |> + tidyr::unnest_longer(col=tidyr::all_of(values_to)) + return(node_pairs) +} + +#' @describeIn as_pairs Convert column pairs to named vector +#' @export +pairs_to_named_vector <- function(df, names_from = name, values_from = value){ + ordered_cols <- dplyr::select(df, {{names_from}}, {{values_from}}) + tibble::deframe(ordered_cols) +} + +#' @describeIn as_pairs Convert column pairs to nested named list +#' @export +pairs_to_named_list <- function(df, names_from = name, values_from = value){ + nested_cols <- dplyr::select(df, {{names_from}}, {{values_from}}) |> + tidyr::nest(values = {{values_from}}) + ordered_cols <- dplyr::select(nested_cols, {{names_from}}, values) + tibble::deframe(ordered_cols) |> + sapply(as.matrix) |> + sapply(as.vector) |> + as.list() +} diff --git a/R/print.R b/R/print.R new file mode 100644 index 0000000..904421b --- /dev/null +++ b/R/print.R @@ -0,0 +1,44 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' +.calc_link_types.xmap_df <- function(xmap_df){ + x_attrs <- attributes(xmap_df) + x_weights <- xmap_df[[x_attrs$col_weights]] + x_to <- xmap_df[[x_attrs$col_to]] + + flags <- c( + "recode" = vhas_recode(x_weights), + "split" = vhas_split(x_weights), + "collapse" = vhas_collapse(x_to) + ) + types <- names(flags[flags == TRUE]) + type <- cli::pluralize("{types} {? }") + + return(type) +} + +#' +.calc_link_direction.xmap_df <- function(xmap_df){ + x_attrs <- attributes(xmap_df) + direction <- paste0("(", x_attrs$col_from, " -> ", x_attrs$col_to, ") ", + "BY " ,x_attrs$col_weights) + return(direction) +} + +#' Print an `xmap` object +#' +#' @name print.xmap +NULL + +#' @describeIn print.xmap Print an `xmap_df` +#' +#' @export +print.xmap_df <- function(x){ + x_direction <- .calc_link_direction.xmap_df(x) + x_type <- .calc_link_types.xmap_df(x) + x_links <- as.data.frame(x) + + ## print headers and links + cat(paste0("xmap_df:\n", x_type, "\n", x_direction, "\n")) + print(x_links) +} diff --git a/R/sysdata.rda b/R/sysdata.rda new file mode 100644 index 0000000..5e4b328 Binary files /dev/null and b/R/sysdata.rda differ diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..5eb3aa4 --- /dev/null +++ b/R/utils.R @@ -0,0 +1,6 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Defaults for NULL values +#' @name op-null-default +#' @keywords internal +`%||%` <- function(x, y) if (is.null(x)) y else x diff --git a/R/validate_xmap_df.R b/R/validate_xmap_df.R new file mode 100644 index 0000000..190e0ab --- /dev/null +++ b/R/validate_xmap_df.R @@ -0,0 +1,25 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Validator for `xmap_df` class (INTERNAL) +#' +#' Only checks class attributes, not crossmap graph properties. +#' Use `verify_links_as_xmap()` or `as_xmap()` to verify graph +#' properties +validate_xmap_df <- function(x) { + stopifnot(is_xmap_df(x)) + + df <- data.frame(x) # unclass(x) + x_attrs <- attributes(x) + col_attrs <- c(x_attrs$col_from, x_attrs$col_to, x_attrs$col_weights) + + ## ---- df attributes ---- + abort_missing_cols(df, col_attrs) + abort_any_na(df) + + ## ---- xmap_df attributes --- + abort_col_order(df, x_attrs$col_from, x_attrs$col_to, x_attrs$col_weights) + abort_from_set(df, x_attrs$col_from, x_attrs$from_set) + + ## return original object + invisible(x) +} diff --git a/R/verify_links_as_xmap.R b/R/verify_links_as_xmap.R new file mode 100644 index 0000000..ec09c33 --- /dev/null +++ b/R/verify_links_as_xmap.R @@ -0,0 +1,31 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Check if candidate links meet crossmap properties +#' +#' @param df data.frame like object containing candidate links +#' @inheritParams vhas_complete_weights +#' @inheritParams as_xmap +#' +#' @export +#' @examples +#' # For a well formed crossmap: +#' links <- data.frame( +#' a = "AUS", +#' b = c("VIC", "NSW", "WA", "OTHER"), +#' w = c(0.1, 0.15, 0.25, 0.5) +#' ) +#' verify_links_as_xmap(links, from = a, to = b, weights = w) +verify_links_as_xmap <- function(df, from, to, weights, tol = .Machine$double.eps^0.5){ + col_from <- deparse(substitute(from)) + col_to <- deparse(substitute(to)) + col_weights <- deparse(substitute(weights)) + col_attrs <- c(col_from, col_to, col_weights) + abort_missing_cols(df, col_attrs) + abort_any_na(df) + abort_weights_col_type(df, col_weights) + abort_dup_pairs(df, col_from, col_to) + stop_bad_weights <- !vhas_complete_weights(df[[col_from]], df[[col_weights]], tol) + if (stop_bad_weights) { abort_bad_weights(col_weights) } + + invisible(df) +} diff --git a/R/verify_named.R b/R/verify_named.R new file mode 100644 index 0000000..856baf3 --- /dev/null +++ b/R/verify_named.R @@ -0,0 +1,202 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Verify crossmap properties of named vectors or lists +#' +#' @param x a Named vector or list. Lists values are flattened via `unlist()`. +#' +#' @return `x` or throws an error +#' @name verify_named +#' @examples +#' ## check each fruit has a unique color +#' fruit_color <- c(apple = "green", strawberry = "red", banana = "yellow") +#' verify_named_all_1to1(fruit_color) +#' +#' ## check no student is assigned to multiple groups +#' student_groups <- list(GRP1 = c("kate", "jane", "peter"), +#' GRP2 = c("terry", "ben", "grace"), +#' GRP3 = c("cindy", "lucy", "alex" )) +#' verify_named_no_dup_names(student_groups) +#' +#' ## check +NULL + +#' @describeIn verify_named Verify named vector or list has only one-to-one relations +#' @export +#' +verify_named_all_1to1 <- function(x){ + stopifnot(is.vector(x)) + unique_names <- unique(names(x)) + unique_values <- unique(unlist(unname(x))) + stop <- !(length(unique_names) == length(unique_values)) + if (stop){ + cli::cli_abort("Not all relations in `x` are 1-to-1.", + class = "abort_not_1to1") + } + invisible(x) +} + +#' @describeIn verify_named Verify name-value pairs of named vector or list are not duplicated +#' @export +verify_named_all_unique <- function(x){ + stopifnot(is.vector(x)) + pairs <- as_pairs_from_named(x) + dup_idx <- anyDuplicated(pairs) + stop <- as.logical(dup_idx) + if (stop){ + cli::cli_abort(c( + "Duplicated pairs found in `x`.", + "i" = "Use `as_pairs_from_named(x) |> base::duplicated()` to identify duplicates."), + class = "abort_not_unique" + ) + } + invisible(x) +} + +#' @describeIn verify_named Verify names of named vector or list are not duplicated +#' @export +verify_named_all_names_unique <- function(x){ + stopifnot(is.vector(x)) + dup_idx <- anyDuplicated(names(x)) + stop <- as.logical(dup_idx) + if (stop){ + cli::cli_abort(c( + "Duplicated names found in `x`.", + "i" = "Use `base::duplicated(names(x))` to identify duplicates."), + class = "abort_not_unique" + ) + } + invisible(x) +} + +#' @describeIn verify_named Verify values in named vector or list are not duplicated (after unnesting) +#' @export +verify_named_all_values_unique <- function(x){ + stopifnot(is.vector(x)) + dup_idx <- anyDuplicated(unlist(unname(x))) + stop <- as.logical(dup_idx) + if (stop){ + cli::cli_abort(c( + "Duplicated values found in `x`.", + "i" = "Use `base::duplicated(unlist(unname(x)))` to identify duplicates."), + class = "abort_not_unique" + ) + } + #stopifnot(unlist(unname(x)) == unique(unlist(unname(student_groups)))) + invisible(x) +} + +#' @describeIn abort Abort message for verify_named_matchset_* functions +#' @export +msg_abort_named_matchset <- function(set_type = c("names", "values"), + match_type = c("exact", "within", "contain")){ + match_text <- switch(match_type, + exact = "do not exactly match", + within = "are not all within", + contain = "do not contain all elements of") + + cli::format_error("The {set_type} of {.var x} {match_text} {.var ref_set}") +} + +#' Verify unique names or values of named vector or list match expected set +#' +#' @name verify_named_matchset +#' @inheritParams verify_named +#' @param ref_set a vector of character strings +#' +#' @return `x` or throw an error +#' @examples +#' fruit_color <- c(apple = "green", strawberry = "red", banana = "yellow") +#' fruit_set <- c("apple", "strawberry", "banana", "pear") +#' fruit_color |> +#' verify_named_matchset_names_within(ref_set = fruit_set) +NULL + +#' @describeIn verify_named_matchset Names of `x` **exactly** match `ref_set` +#' @export +verify_named_matchset_names_exact <- function(x, ref_set){ + stopifnot(is.vector(x)) + unique_names <- unique(names(x)) + stop <- !setequal(ref_set, unique_names) + if (stop) { + cli::cli_abort(msg_abort_named_matchset("names", "exact"), + class = "abort_matchset") + } + invisible(x) +} + +#' @describeIn verify_named_matchset Values of `x` **exactly** match `ref_set` +#' @export +verify_named_matchset_values_exact <- function(x, ref_set){ + stopifnot(is.vector(x)) + unique_values <- unique(unlist(unname(x))) + stop <- !setequal(ref_set, unique_values) + if (stop) { + cli::cli_abort(msg_abort_named_matchset("values", "exact"), + class = "abort_matchset") + } + invisible(x) +} + +#' @describeIn verify_named_matchset Names of `x` **contain** all of `ref_set` +#' @export +verify_named_matchset_names_contain <- function(x, ref_set){ + stopifnot(is.vector(x)) + unique_names <- unique(names(x)) + stop <- !all(ref_set %in% unique_names) + if (stop){ + cli::cli_abort(msg_abort_named_matchset("names", "contain"), + class = "abort_matchset") + } + invisible(x) +} + +#' @describeIn verify_named_matchset Values of `x` **contain** all of `ref_set` +#' @export +verify_named_matchset_values_contain <- function(x, ref_set){ + stopifnot(is.vector(x)) + unique_values <- unique(unlist(unname(x))) + stop <- !all(ref_set %in% unique_values) + if (stop){ + cli::cli_abort(msg_abort_named_matchset("values", "contain"), + class = "abort_matchset") + } + invisible(x) +} + +#' @describeIn verify_named_matchset Names of `x` are all **within** `ref_set` +#' @export +verify_named_matchset_names_within <- function(x, ref_set){ + stopifnot(is.vector(x)) + unique_x <- unique(names(x)) + stop <- !all(unique_x %in% ref_set) + if (stop){ + cli::cli_abort(msg_abort_named_matchset("names", "within"), + class = "abort_matchset") + } + invisible(x) +} + +#' @describeIn verify_named_matchset Values of `x` are all **within** `ref_set` +#' @export +verify_named_matchset_values_within <- function(x, ref_set){ + stopifnot(is.vector(x)) + unique_x <- unique(unlist(unname(x))) + stop <- !all(unique_x %in% ref_set) + if (stop){ + cli::cli_abort(msg_abort_named_matchset("values", "within"), + class = "abort_matchset") + } + invisible(x) +} + +#' @describeIn verify_named Alias of `verify_named_all_1to1()` +#' @export +verify_named_as_recode_unique <- verify_named_all_1to1 + +#' @describeIn verify_named Alias of `verify_named_all_values_unique()` +#' @export +verify_named_no_dup_values <- verify_named_all_values_unique + +#' @describeIn verify_named Alias of `verify_named_all_names_unique()` +#' @export +verify_named_no_dup_names <- verify_named_all_names_unique diff --git a/R/verify_pairs.R b/R/verify_pairs.R new file mode 100644 index 0000000..3bcf0c7 --- /dev/null +++ b/R/verify_pairs.R @@ -0,0 +1,40 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Verify crossmap properties of column pairs +#' +#' @param df a data frame-like object with at least two columns +#' @inheritParams verify_links_as_xmap +#' +#' @return `df` or error +#' @name verify_pairs +NULL + +#' @describeIn verify_pairs Verify column pairs have only one-to-one relations +#' @export +verify_pairs_all_1to1 <- function(df, from, to){ + stopifnot(is.data.frame(df)) + set_from <- unique(df[[rlang::englue("{{from}}")]]) + set_to <- unique(df[[rlang::englue("{{to}}")]]) + stopifnot(length(set_from) == length(set_to)) + invisible(df) +} + +#' @describeIn verify_pairs Verify column pairs are all unique +#' @export +verify_pairs_all_unique <- function(df, from, to){ + stopifnot(is.data.frame(df)) + pairs <- dplyr::select(df, {{ from }}, {{to}}) + stopifnot(!as.logical(anyDuplicated(pairs))) + invisible(df) +} + +#' @describeIn verify_pairs Alias of `verify_pairs_all_1to1` +#' @export +verify_pairs_as_recode_unique <- verify_pairs_all_1to1 + +# @describeIn verify_pairs Verify column pairs outgoing link degree +verify_pairs_out <- function(x, from, to, max_out, min_out){ + # TODO: FINISH THIS! + dplyr::group_by(x, {{from}}) |> + dplyr::summarise() +} diff --git a/R/vhas.R b/R/vhas.R new file mode 100644 index 0000000..8a60a83 --- /dev/null +++ b/R/vhas.R @@ -0,0 +1,89 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Boolean flags for properties of candidate and validated xmap links (internal) +#' +#' @description +#' `vhas_*()` functions check properties of xmap links and/or candidate links. +#' The functions only accepts equal length vector inputs to support multiple link formats, +#' but does not check if the inputs are from the same xmap. +#' @param v_from,v_to,v_weights equal length vectors containing the source-target node pairs +#' +#' @return TRUE or FALSE +#' +#' @name vhas +NULL + +#' @describeIn vhas Returns TRUE if xmap does not have +#' duplicate pairs of source-target nodes (irrespective of weights) +#' +vhas_no_dup_pairs <- function(v_from, v_to) { + stopifnot(is.vector(v_from)) + stopifnot(is.vector(v_to)) + stopifnot(identical(length(v_from), length(v_to))) + links <- data.frame(v_from, v_to) + dup_idx <- anyDuplicated(links) + !as.logical(dup_idx) +} + +#' @describeIn vhas Returns TRUE if all weights for a given `from` label +#' sum to one (approximately) +#' @param tol numeric \eqn{\ge 0}. Ignore differences smaller than `tol`. +#' Passed through to the `tolerance` arg of `base::all.equal()`. +vhas_complete_weights <- function(v_from, v_weights, tol = .Machine$double.eps^0.5) { + stopifnot(is.vector(v_from)) + stopifnot(is.vector(v_weights)) + stopifnot(identical(length(v_from), length(v_weights))) + sum_w <- tapply( + X = v_weights, + INDEX = v_from, + FUN = sum, + simplify = TRUE + ) |> as.vector() + names(sum_w) <- NULL + ones <- rep(1, length(sum_w)) + all(isTRUE(all.equal(sum_w, ones, tolerance = tol))) +} + +.calc_vector_lens <- function(...){ + v_list <- list(...) + v_lens <- sapply(v_list, length) + return(v_lens) +} + +#' @describeIn vhas Returns TRUE if links have no duplicate pairs and complete weights +vhas_xmap_props <- function(v_from, v_to, v_weights){ + ## check vectors are equal length + v_lengths <- .calc_vector_lens(v_from, v_to, v_weights) + stopifnot(length(unique(v_lengths)) == 1) + + ## check properties + v_props <- c( + pairs = vhas_no_dup_pairs(v_from, v_to), + weights = vhas_complete_weights(v_from, v_weights) + ) + all(v_props) +} + +#' @describeIn vhas Return TRUE if xmap recodes labels between `from` and `to` +vhas_1to1 <- function(v_weights) { + stopifnot(is.vector(v_weights)) + any(v_weights == 1) +} +#' +vhas_recode <- vhas_1to1 + +#' @describeIn vhas Return TRUE if xmap has splitting links between `from` and `to` +vhas_1toM <- function(v_weights) { + stopifnot(is.vector(v_weights)) + any(v_weights < 1) +} +#' +vhas_split <- vhas_1toM + +#' @describeIn vhas Return TRUE if xmap has collapsing links between `from` and `to` +vhas_1fromM <- function(v_to){ + stopifnot(is.vector(v_to)) + as.logical(anyDuplicated(v_to)) +} +#' +vhas_collapse <- vhas_1fromM diff --git a/R/xmap_df.R b/R/xmap_df.R new file mode 100644 index 0000000..b64d64a --- /dev/null +++ b/R/xmap_df.R @@ -0,0 +1,31 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Low Level Constructors for xmap subclasses +#' @param x data-frame object containing candidate links. +#' @param col_from,col_to,col_weights character strings naming columns containing source nodes, target nodes and numeric weights. +#' @return xmap_df object. Note that this function unclasses tibbles. +#' @name new_xmap +NULL + +#' @describeIn new_xmap Construct xmap_df from data.frame +new_xmap_df <- function(x, col_from, col_to, col_weights, from_set = NULL) { + #' checks argument types + stopifnot(is.data.frame(x)) + stopifnot(length(col_from) == 1 && is.character(col_from)) + stopifnot(length(col_to) == 1 && is.character(col_to)) + stopifnot(length(col_weights) == 1 && is.character(col_weights)) + + #' naively generates `from_set` if it is missing + from_set <- from_set %||% as.character(unique(x[[col_from]])) + stopifnot(is.vector(from_set, mode = "character")) + + #' @return `x` with additional subclasses `xmap_df` and `xmap` + + class(x) <- .calc_xmap_subclass_attr("xmap_df") + structure(x, + col_from = col_from, + col_to = col_to, + col_weights = col_weights, + from_set = from_set + ) +} diff --git a/R/xmap_drop_extra.R b/R/xmap_drop_extra.R new file mode 100644 index 0000000..8b7f626 --- /dev/null +++ b/R/xmap_drop_extra.R @@ -0,0 +1,20 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Drop extra columns from `xmap` objects +#' +#' @param .xmap an xmap object +#' +#' @export +xmap_drop_extra <- function(.xmap){ + stopifnot(is_xmap_df(.xmap)) + # get col names + col_strings <- simplify2array(.get_col_attrs.xmap_df(.xmap)) + # construct new xmap with only necessary columns + z_attrs <- attributes(.xmap) + z_attrs$names <- unname(col_strings) + z <- as.data.frame(.xmap) + z <- z[,col_strings] + attributes(z) <- z_attrs + + return(z) +} diff --git a/R/xmap_reverse.R b/R/xmap_reverse.R new file mode 100644 index 0000000..e68fec5 --- /dev/null +++ b/R/xmap_reverse.R @@ -0,0 +1,42 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Reverse xmap direction +#' +#' @param .xmap xmap object to be reversed +#' @param weights_into A string specifying the name of a new or existing column to store reverse weights in. +#' +#' @return xmap object of same class as `x`, or throws an error if `x` is not reversible +#' @export +xmap_reverse <- function(.xmap, weights_into){ + UseMethod("xmap_reverse") +} + +#' @describeIn xmap_reverse Reverse a `xmap_df` +#' +#' @export +xmap_reverse.xmap_df <- function(.xmap, weights_into = "r_weights"){ + stopifnot(inherits(.xmap, "xmap_df")) + x_attrs <- attributes(.xmap) + df <- as.data.frame(.xmap) + + ## check xmap can be reversed + abort_not_reversible(df, x_attrs$col_to) + + ## make new xmap + df[[weights_into]] <- 1 + new_from <- x_attrs$col_to + new_to <- x_attrs$col_from + new_weights <- weights_into + new_cols <- c(new_from, new_to, new_weights) + ## rearrange columns + #col_order <- c(new_cols, setdiff(names(df), new_cols)) + df <- df[new_cols] + + ## construction + xmap <- new_xmap_df(df, new_from, new_to, new_weights) + + ## validation + validate_xmap_df(xmap) + + return(xmap) +} diff --git a/R/xmap_to.R b/R/xmap_to.R new file mode 100644 index 0000000..0b48eaa --- /dev/null +++ b/R/xmap_to.R @@ -0,0 +1,127 @@ +# Generated from create-xmap.Rmd: do not edit by hand + +#' Extract incidence matrix from xmap objects +#' +#' Transforms `xmap` objects into incidence matrix where the rows are indexed by the `from` values +#' and the columns are indexed by `to` values. Drops any additional variables. +#' +#' @param .xmap an xmap object +#' @param sparse logical specifying if the result should be a sparse matrix. Defaults to TRUE. +#' @param ... Reversed for passing arguments to `stats::xtabs` +#' +#' @return A matrix or sparse matrix object +#' @family {xmap coercion} +#' +#' @export +xmap_to_matrix <- function(.xmap, sparse, ...) { + UseMethod("xmap_to_matrix") +} + +#' @describeIn xmap_to_matrix Coerce a `xmap_df` to a Matrix +#' +#' @export +#' @examples +#' abc_xmap <- data.frame( +#' stringsAsFactors = FALSE, +#' origin = c("a","b","c","d","e", +#' "f","g","h","i","i","j","j","j"), +#' dest = c("AA","AA","AA","AA", +#' "BB","BB","CC","DD","EE","FF","GG","HH","II"), +#' link = c(1, 1, 1, 1, 1, 1, 1, 1, 0.5, 0.5, 0.3, 0.3, 0.4) +#' ) |> +#' as_xmap_df(origin, dest, link) +#' xmap_to_matrix(abc_xmap) +xmap_to_matrix.xmap_df <- function(.xmap, sparse = TRUE, ...){ + x_attrs <- attributes(.xmap) + df <- .xmap |> as.data.frame(stringsAsFactors = TRUE) + fm <- paste(x_attrs$col_weights, "~", x_attrs$col_from, "+", x_attrs$col_to, + collapse = "") + + if(sparse){ + x_mtx <- stats::xtabs(stats::as.formula(fm), df, sparse = TRUE) + } else { + x_mtx <- stats::xtabs(stats::as.formula(fm), df, sparse = FALSE) + attr(x_mtx, "call") <- NULL + unclass(x_mtx) + } + return(x_mtx) +} + +#' Coerce a unit weight `xmap_df` to a named vector or list +#' +#' Checks that an `xmap` has unit weights, and converts the +#' `from` values into: +#' * a vector for `xmap_to_named_vector()` +#' * a nested list for `xmap_to_named_list()` +#' +#' Names are the unique target nodes in `to`, +#' and each element contains the source node(s) in `from`. +#' +#' @param .xmap xmap with only unit weights +#' +#' @return Named vector or list. +#' @export +#' @rdname xmap_to_named +#' @family {xmap coercion} +#' +#' @examples +#' iso_vector <- c(AF = "004", AL = "008", DZ = "012", AS = "016", AD = "020") +#' iso_xmap <- iso_vector |> +#' as_pairs_from_named(names_to = "iso2c", values_to = "iso3n") |> +#' add_weights_unit() |> +#' as_xmap_df(from = iso3n, to = iso2c, weights) +#' identical(iso_vector, xmap_to_named_vector(iso_xmap)) +xmap_to_named_vector <- function(.xmap){ + stopifnot(is_xmap(.xmap)) + x_attrs <- attributes(.xmap) + df <- as.data.frame(.xmap) + # check only unit weights + w <- df[[x_attrs$col_weights]] + stop <- !all(w == 1) + if (stop) { + cli::cli_abort(msg_abort_frac_weights("Cannot convert to named vector"), + class = "abort_frac_weights") + } + + # convert + df |> + subset(select = c(x_attrs$col_to, x_attrs$col_from)) |> + tibble::deframe() |> + sapply(as.matrix) |> + sapply(as.vector) +} + +#' @rdname xmap_to_named +#' @export +#' +#' @examples +#' animal_list <- list(MAMM = c("elephant", "whale", "monkey"), +#' REPT = c("lizard", "turtle"), +#' CRUS = c("crab")) +#' animal_xmap <- animal_list |> +#' as_pairs_from_named(names_to = "class", values_to = "animals") |> +#' add_weights_unit() |> +#' as_xmap_df(from = animals, to = class, weights = weights) +#' identical(xmap_to_named_list(animal_xmap), animal_list) +xmap_to_named_list <- function(.xmap) { + stopifnot(is_xmap(.xmap)) + x_attrs <- attributes(.xmap) + df <- as.data.frame(.xmap) + # check only unit weights + w <- df[[x_attrs$col_weights]] + + stop <- !all(w == 1) + if (stop) { + cli::cli_abort(msg_abort_frac_weights("Cannot convert to named list"), + class = "abort_frac_weights") + } + + # convert + df |> + subset(select = c(x_attrs$col_to, x_attrs$col_from)) |> + tidyr::nest(source = c(x_attrs$col_from)) |> + tibble::deframe() |> + sapply(as.matrix) |> + sapply(as.vector) |> + as.list() +} diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..4390df6 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,45 @@ +--- +output: github_document +--- + + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Overview + +There are lots of ways in R to modify categorical variables and to redistribution numeric values between categories. If you are performing a simple recoding of category labels, or collapsing multiple categories, you might use the [`forcats`](https://github.com/tidyverse/forcats) package. However, if you are doing more complex transformations you might find yourself writing custom functions or scripts including mutating joins, grouped summary operations, or case-wise transformation of numeric values. Verifying these data wrangling scripts and pipelines becomes more difficult as the number of categories and the complexity of the mappings increases. Current solutions to this issue mostly involve ad hoc data validation of the data before and after they are transformed ([`assertr`](https://github.com/ropensci/assertr), [`validate`](https://data-cleaning.github.io/validate/), [`pointblank`](https://rich-iannone.github.io/pointblank/)). + +The `xmap` package offers an alternative approach to ensuring your code performs the intended transformations. Instead of inspecting the data, the package provides tools for validating the mapping objects which are used to transform the data. Examples of mapping objects and available verification functions include: + +- **Named vectors or lists** + - Commonly used as reference inputs for recoding or collapsing categories. + - Use `verify_named()` for checking properties such as uniqueness or 1-to-1 relations, + - and `verify_named_matchset()` for checking the set of names or values matches expectations. +- **Lookup tables**: + - Also known as crosswalks and concordance tables + - Use `verify_pairs()` for checking uniqueness and 1-to-1 relations. +- **Crossmaps**: + - a new graph-based extension of Crosswalk tables that also store redistribution weights for ambiguous 1-to-many relations. + - Use `verify_links_as_xmap()` to check aggregation or disaggregation weights and other desirable properties. + +See `vignette("xmap")` to get started using verification functions in your existing workflows. The functions are based on results obtained by representing and analysing recoding or redistribution transformations as directed, weighted bipartite graphs (i.e. "Crossmaps"). For more information about this underlying graph structure, and the experimental `xmap_df` class, see `vignette("making-xmaps")` and `vignette("vis-xmaps")`. + +## Installation + +To install the latest release of `xmap`: + +``` r +remotes::install_github("cynthiahqy/xmap") +``` + +To install the latest development version of `xmap`: + +``` r +remotes::install_github("cynthiahqy/conformr-xmap-project", subdir = "xmap") +``` diff --git a/README.md b/README.md new file mode 100644 index 0000000..15f31f3 --- /dev/null +++ b/README.md @@ -0,0 +1,63 @@ + + + +## Overview + +There are lots of ways in R to modify categorical variables and to +redistribution numeric values between categories. If you are performing +a simple recoding of category labels, or collapsing multiple categories, +you might use the [`forcats`](https://github.com/tidyverse/forcats) +package. However, if you are doing more complex transformations you +might find yourself writing custom functions or scripts including +mutating joins, grouped summary operations, or case-wise transformation +of numeric values. Verifying these data wrangling scripts and pipelines +becomes more difficult as the number of categories and the complexity of +the mappings increases. Current solutions to this issue mostly involve +ad hoc data validation of the data before and after they are transformed +([`assertr`](https://github.com/ropensci/assertr), +[`validate`](https://data-cleaning.github.io/validate/), +[`pointblank`](https://rich-iannone.github.io/pointblank/)). + +The `xmap` package offers an alternative approach to ensuring your code +performs the intended transformations. Instead of inspecting the data, +the package provides tools for validating the mapping objects which are +used to transform the data. Examples of mapping objects and available +verification functions include: + +- **Named vectors or lists** + - Commonly used as reference inputs for recoding or collapsing + categories. + - Use `verify_named()` for checking properties such as uniqueness or + 1-to-1 relations, + - and `verify_named_matchset()` for checking the set of names or + values matches expectations. +- **Lookup tables**: + - Also known as crosswalks and concordance tables + - Use `verify_pairs()` for checking uniqueness and 1-to-1 relations. +- **Crossmaps**: + - a new graph-based extension of Crosswalk tables that also store + redistribution weights for ambiguous 1-to-many relations. + - Use `verify_links_as_xmap()` to check aggregation or disaggregation + weights and other desirable properties. + +See `vignette("xmap")` to get started using verification functions in +your existing workflows. The functions are based on results obtained by +representing and analysing recoding or redistribution transformations as +directed, weighted bipartite graphs (i.e. “Crossmaps”). For more +information about this underlying graph structure, and the experimental +`xmap_df` class, see `vignette("making-xmaps")` and +`vignette("vis-xmaps")`. + +## Installation + +To install the latest release of `xmap`: + +``` r +remotes::install_github("cynthiahqy/xmap") +``` + +To install the latest development version of `xmap`: + +``` r +remotes::install_github("cynthiahqy/conformr-xmap-project", subdir = "xmap") +``` diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..60d1e44 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,61 @@ +destination: ./docs/ + +url: ~ +template: + bootstrap: 5 + +repo: + url: + home: https://github.com/cynthiahqy/xmap/ + source: https://github.com/cynthiahqy/xmap/ + issue: https://github.com/cynthiahqy/conformr-xmap-project/issues + user: https://github.com/cynthiahqy + +authors: + Cynthia Huang: + href: https://www.cynthiahqy.com + Laura Puzzello: + href: https://sites.google.com/site/laurapuzzello + +navbar: + structure: + left: [intro, reference, articles] + right: [github] + components: + github: + icon: fa-github + href: https://github.com/cynthiahqy/xmap/ + +reference: +- title: Verify existing mapping objects + contents: + - starts_with("verify_named") + - starts_with("verify_pairs") + - starts_with("verify_links") + +- title: Creating `xmap` objects + contents: + - starts_with("pairs") + - starts_with("links") + - starts_with("add_weights") + - starts_with("as") + - as_xmap_df + - mock + +- title: Using and Modifying `xmap` objects + contents: + - starts_with("xmap_to") + - starts_with("xmap") + - is_xmap + - print.xmap + +- title: internal + contents: + - starts_with("vhas") + - starts_with("abort") + - starts_with("msg") + - starts_with("validate") + - starts_with(".get") + - starts_with(".calc") + - new_xmap_df + - validate_xmap_df diff --git a/data-raw/mock.R b/data-raw/mock.R new file mode 100644 index 0000000..f0b9976 --- /dev/null +++ b/data-raw/mock.R @@ -0,0 +1,21 @@ +## code to prepare `mock` dataset goes here + +usethis::use_data(mock, overwrite = TRUE) + +mock <- list() + +mock$named_ctr_iso3c <- countrycode::codelist |> + dplyr::select(iso3c, iso.name.en) |> + tidyr::drop_na() |> + tibble::deframe() + +# mock_named$collapse_list <- list(MAMM = c("elephant", "whale", "monkey"), +# REPT = c("lizard", "turtle"), +# CRUS = c("crab")) + +mock$df_anzsco21 <- strayr::anzsco2021 |> + dplyr::select(tidyselect::starts_with(c("anzsco_major", "anzsco_submajor"))) |> + dplyr::distinct() |> + dplyr::select(tidyselect::ends_with("_code"), tidyselect::everything()) + +usethis::use_data(mock) diff --git a/data/mock.rda b/data/mock.rda new file mode 100644 index 0000000..23f8ce3 Binary files /dev/null and b/data/mock.rda differ diff --git a/man/abort.Rd b/man/abort.Rd new file mode 100644 index 0000000..c619833 --- /dev/null +++ b/man/abort.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/abort.R, R/verify_named.R +\name{abort_missing_cols} +\alias{abort_missing_cols} +\alias{abort_any_na} +\alias{abort} +\alias{abort_weights_col_type} +\alias{abort_dup_pairs} +\alias{abort_bad_weights} +\alias{abort_col_order} +\alias{abort_from_set} +\alias{msg_abort_frac_weights} +\alias{abort_not_reversible} +\alias{msg_abort_named_matchset} +\title{Validation functions and messages for xmap or candidate links (Internal)} +\usage{ +abort_missing_cols(df, cols) + +abort_any_na(df) + +abort_weights_col_type(df, col_weights) + +abort_dup_pairs(df, col_from, col_to) + +abort_bad_weights(col_weights, call = rlang::caller_env()) + +abort_col_order(df, col_from, col_to, col_weights) + +abort_from_set(df, col_from, from_set) + +msg_abort_frac_weights(impact) + +abort_not_reversible(df, col_to) + +msg_abort_named_matchset( + set_type = c("names", "values"), + match_type = c("exact", "within", "contain") +) +} +\arguments{ +\item{df}{a data.frame-like object containing links} + +\item{col_from, col_to, col_weights}{character vector or values naming columns from \code{df}} +} +\value{ +An error if the validation condition fails, +and invisibly returns \code{df} otherwise. +} +\description{ +Checks issues with data.frame like objects containing validated \code{xmap} or candidate links. +} +\section{Functions}{ +\itemize{ +\item \code{abort_missing_cols()}: Abort if named columns can't be found in df + +\item \code{abort_any_na()}: Abort if xmap_df has missing values + +\item \code{abort_weights_col_type()}: Abort if xmap_df has wrong column types + +\item \code{abort_dup_pairs()}: Abort if duplicate source-target pairs are found + +\item \code{abort_bad_weights()}: Abort for invalid mapping weights + +\item \code{abort_col_order()}: Abort if xmap_df columns are not in order + +\item \code{abort_from_set()}: Abort if from_set attribute doesn't match xmap_df values + +\item \code{msg_abort_frac_weights()}: Abort message for fractional weights + +\item \code{abort_not_reversible()}: Abort if xmap_df is not reversible without new weights + +\item \code{msg_abort_named_matchset()}: Abort message for verify_named_matchset_* functions + +}} diff --git a/man/add_weights_equal.Rd b/man/add_weights_equal.Rd new file mode 100644 index 0000000..909b19b --- /dev/null +++ b/man/add_weights_equal.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add_weights_equal.R +\name{add_weights_equal} +\alias{add_weights_equal} +\title{Add equal fractional weights to groups of source-target node pairs} +\usage{ +add_weights_equal(df, from, to, weights_into = "weights") +} +\arguments{ +\item{df}{a data frame-like object with at least two columns} + +\item{from, to}{Columns in \code{x} specifying the source and target nodes} + +\item{weights_into}{character string naming new column to store link weights in} +} +\value{ +\code{pairs} with additional column of weights +} +\description{ +Attaches equal weights to every source-target link from a given source node. +The resultant weighted links can be verified or coerced into an \code{xmap}. +} +\examples{ +animal_pairs <- list(MAMM = c("elephant", "whale", "monkey"), + REPT = c("lizard", "turtle"), + CRUS = c("crab")) |> + as_pairs_from_named("class", "animal") +animal_pairs |> + add_weights_equal(from = class, to = animal) +} +\seealso{ +Other {Helpers for adding weights to pairs}: +\code{\link{add_weights_unit}()} +} +\concept{{Helpers for adding weights to pairs}} diff --git a/man/add_weights_unit.Rd b/man/add_weights_unit.Rd new file mode 100644 index 0000000..f15a964 --- /dev/null +++ b/man/add_weights_unit.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add_weights_unit.R +\name{add_weights_unit} +\alias{add_weights_unit} +\title{Add unit weights to node pairs table} +\usage{ +add_weights_unit(df, weights_into = "weights") +} +\arguments{ +\item{df}{a data frame-like object with at least two columns} + +\item{weights_into}{character string naming new column to store link weights in} +} +\value{ +\code{pairs} with additional column of ones +} +\description{ +Attaches column of unit weights to pairs of source-target nodes. +The resultant weighted links can be verified or coerced into \code{xmap}. +} +\examples{ +AUS_pairs <- list(AUS = c("NSW", "QLD", "SA", "TAS", "VIC", "WA", "ACT", "NT")) |> + as_pairs_from_named(names_to = "ctr", values_to = "state") +AUS_pairs |> + add_weights_unit(weights_into = "weights") +} +\seealso{ +Other {Helpers for adding weights to pairs}: +\code{\link{add_weights_equal}()} +} +\concept{{Helpers for adding weights to pairs}} diff --git a/man/as_pairs.Rd b/man/as_pairs.Rd new file mode 100644 index 0000000..9822ef4 --- /dev/null +++ b/man/as_pairs.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pairs_named.R +\name{as_pairs} +\alias{as_pairs} +\alias{as_pairs_from_named} +\alias{pairs_to_named_vector} +\alias{pairs_to_named_list} +\title{Convert between column pairs and named vector or lists} +\usage{ +as_pairs_from_named(x, names_to = "name", values_to = "value") + +pairs_to_named_vector(df, names_from = name, values_from = value) + +pairs_to_named_list(df, names_from = name, values_from = value) +} +\arguments{ +\item{x}{a Named vector or list. Lists values are flattened via \code{unlist()}.} + +\item{names_to, values_to}{character vector specify the new columns to pass the information in \code{x} into.} + +\item{df}{a data frame-like object with at least two columns} + +\item{names_from, values_from}{two columns in \code{x} to convert to names and values} +} +\value{ +\itemize{ +\item For \code{as_pairs_from_named()}: a two-column tibble +\item For \code{pairs_to_named} fncs: named vector or list +} +} +\description{ +Convert named vectors or nested lists into a two-column table of node pairs and vice versa. +\code{as_pairs_from_named} extracts the vector or list element names and the values, unnesting where necessary. +\code{pairs_to_named_vector} extracts name-value pairs from column pairs as is, +whilst \code{pairs_to_named_list()} nests the values first. +} +\section{Functions}{ +\itemize{ +\item \code{as_pairs_from_named()}: Convert named vector or nested list into column pairs + +\item \code{pairs_to_named_vector()}: Convert column pairs to named vector + +\item \code{pairs_to_named_list()}: Convert column pairs to nested named list + +}} +\examples{ +# Coerce named vectors and list to column pairs + +veg_vec <- c(eggplant = "aubergine", zucchini = "courgette") +as_pairs_from_named(veg_vec, "au_eng", "uk_eng") + +animal_list <- list(MAMM = c("elephant", "whale", "monkey"), + REPT = c("lizard", "turtle"), + CRUS = c("crab")) +as_pairs_from_named(animal_list, "class", "animal") + +# Convert pairs back to named vector and lists +veg_from_pairs <- as_pairs_from_named(veg_vec) |> + pairs_to_named_vector(names_from = name, values_from = value) +identical(veg_vec, veg_from_pairs) + +animal_from_pairs <- as_pairs_from_named(animal_list, "class", "animal") |> + pairs_to_named_list(names_from = class, values_from = animal) +identical(animal_list, animal_from_pairs) +} diff --git a/man/as_xmap.Rd b/man/as_xmap.Rd new file mode 100644 index 0000000..def2f0b --- /dev/null +++ b/man/as_xmap.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_xmap.R, R/is_xmap.R +\name{as_xmap} +\alias{as_xmap} +\alias{as_xmap_df} +\alias{as_xmap_df.data.frame} +\alias{is_xmap} +\alias{is_xmap_df} +\title{Coerce objects to xmap_df} +\usage{ +as_xmap_df( + x, + from, + to, + weights, + tol = .Machine$double.eps^0.5, + subclass = c("xmap_df"), + ... +) + +\method{as_xmap_df}{data.frame}( + x, + from, + to, + weights, + tol = .Machine$double.eps^0.5, + subclass = "xmap_df", + .drop_extra = TRUE +) + +is_xmap(x) + +is_xmap_df(x) +} +\arguments{ +\item{x}{\itemize{ +\item For \code{as_xmap_df()}: An object to coerce +\item For \code{is_xmap_df()}: An object to test. +}} + +\item{from, to}{Columns in \code{x} specifying the source and target nodes} + +\item{weights}{Column in \code{x} specifying the weight applied to data passed along the directed link between source and target node} + +\item{tol}{numeric \eqn{\ge 0}. Ignore differences smaller than \code{tol}. +Passed through to the \code{tolerance} arg of \code{base::all.equal()}.} + +\item{subclass}{Which xmap subclass to return. Defaults to \code{xmap_df} for \code{data.frame} and \code{tibble}} + +\item{.drop_extra}{Drop columns other than \code{from}, \code{to} and \code{weights}. Defaults to \code{TRUE}} +} +\value{ +A validated \code{xmap} object. +} +\description{ +Validates and creates a valid crossmap \code{xmap_df} object. + +This function returns \code{TRUE} for crossmaps \code{xmap} or subclasses thereof (\code{xmap_df}), and \code{FALSE} for all other objects, including regular data.frames or tibbles. +} +\section{Functions}{ +\itemize{ +\item \code{as_xmap_df(data.frame)}: Coerce a \code{data.frame} to \code{xmap} + +}} +\examples{ +# For a well formed crossmap: +links <- data.frame( + a = "AUS", + b = c("VIC", "NSW", "WA", "OTHER"), + w = c(0.1, 0.15, 0.25, 0.5) +) +as_xmap_df(links, from = a, to = b, weights = w) + +# extra columns are dropped, +links$extra <- c(2, 4, 5, 6) +as_xmap_df(links, from = a, to = b, weights = w) +} diff --git a/man/mock.Rd b/man/mock.Rd new file mode 100644 index 0000000..9d590e7 --- /dev/null +++ b/man/mock.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{mock} +\alias{mock} +\title{Mock input objects for the \code{xmap} package} +\format{ +\subsection{\code{mock}}{ + +A list with: +\describe{ +\item{named_ctr_iso3c}{named vector. Names are ISO-3 country codes, values are ISO English country names. Retrieved from \code{countrycode} package: +\url{https://github.com/vincentarelbundock/countrycode}} +\item{df_anzsco21}{4-column tibble. Contains major and submajor occupation codes and descriptions for ANZSCO21. Retrieved from \code{strayr} package: +\url{https://github.com/runapp-aus/strayr}} +\item{df_mixed}{3-column data.frame. } +} +} +} +\usage{ +mock +} +\description{ +A collection of mock inputs for experimenting with functions +in the \code{xmap} package. +\code{named_} objects are either named vectors or nested lists. +\code{df_} objects may contain source-target \emph{pairs} (no weights), +or weighted source-target \emph{links}. +} +\keyword{datasets} diff --git a/man/new_xmap.Rd b/man/new_xmap.Rd new file mode 100644 index 0000000..add7bca --- /dev/null +++ b/man/new_xmap.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xmap_df.R +\name{new_xmap} +\alias{new_xmap} +\alias{new_xmap_df} +\title{Low Level Constructors for xmap subclasses} +\usage{ +new_xmap_df(x, col_from, col_to, col_weights, from_set = NULL) +} +\arguments{ +\item{x}{data-frame object containing candidate links.} + +\item{col_from, col_to, col_weights}{character strings naming columns containing source nodes, target nodes and numeric weights.} +} +\value{ +xmap_df object. Note that this function unclasses tibbles. + +\code{x} with additional subclasses \code{xmap_df} and \code{xmap} +} +\description{ +Low Level Constructors for xmap subclasses +} +\section{Functions}{ +\itemize{ +\item \code{new_xmap_df()}: Construct xmap_df from data.frame +checks argument types +naively generates \code{from_set} if it is missing + +}} diff --git a/man/op-null-default.Rd b/man/op-null-default.Rd new file mode 100644 index 0000000..a13048e --- /dev/null +++ b/man/op-null-default.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{op-null-default} +\alias{op-null-default} +\alias{\%||\%} +\title{Defaults for NULL values} +\usage{ +x \%||\% y +} +\description{ +Defaults for NULL values +} +\keyword{internal} diff --git a/man/print.xmap.Rd b/man/print.xmap.Rd new file mode 100644 index 0000000..064f33d --- /dev/null +++ b/man/print.xmap.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/print.R +\name{print.xmap} +\alias{print.xmap} +\alias{print.xmap_df} +\title{Print an \code{xmap} object} +\usage{ +\method{print}{xmap_df}(x) +} +\description{ +Print an \code{xmap} object +} +\section{Functions}{ +\itemize{ +\item \code{print(xmap_df)}: Print an \code{xmap_df} + +}} diff --git a/man/validate_xmap_df.Rd b/man/validate_xmap_df.Rd new file mode 100644 index 0000000..5e553cd --- /dev/null +++ b/man/validate_xmap_df.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_xmap_df.R +\name{validate_xmap_df} +\alias{validate_xmap_df} +\title{Validator for \code{xmap_df} class (INTERNAL)} +\usage{ +validate_xmap_df(x) +} +\description{ +Only checks class attributes, not crossmap graph properties. +Use \code{verify_links_as_xmap()} or \code{as_xmap()} to verify graph +properties +} diff --git a/man/verify_links_as_xmap.Rd b/man/verify_links_as_xmap.Rd new file mode 100644 index 0000000..4222c80 --- /dev/null +++ b/man/verify_links_as_xmap.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/verify_links_as_xmap.R +\name{verify_links_as_xmap} +\alias{verify_links_as_xmap} +\title{Check if candidate links meet crossmap properties} +\usage{ +verify_links_as_xmap(df, from, to, weights, tol = .Machine$double.eps^0.5) +} +\arguments{ +\item{df}{data.frame like object containing candidate links} + +\item{from, to}{Columns in \code{x} specifying the source and target nodes} + +\item{weights}{Column in \code{x} specifying the weight applied to data passed along the directed link between source and target node} + +\item{tol}{numeric \eqn{\ge 0}. Ignore differences smaller than \code{tol}. +Passed through to the \code{tolerance} arg of \code{base::all.equal()}.} +} +\description{ +Check if candidate links meet crossmap properties +} +\examples{ +# For a well formed crossmap: +links <- data.frame( + a = "AUS", + b = c("VIC", "NSW", "WA", "OTHER"), + w = c(0.1, 0.15, 0.25, 0.5) +) +verify_links_as_xmap(links, from = a, to = b, weights = w) +} diff --git a/man/verify_named.Rd b/man/verify_named.Rd new file mode 100644 index 0000000..3a3e5a1 --- /dev/null +++ b/man/verify_named.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/verify_named.R +\name{verify_named} +\alias{verify_named} +\alias{verify_named_all_1to1} +\alias{verify_named_all_unique} +\alias{verify_named_all_names_unique} +\alias{verify_named_all_values_unique} +\alias{verify_named_as_recode_unique} +\alias{verify_named_no_dup_values} +\alias{verify_named_no_dup_names} +\title{Verify crossmap properties of named vectors or lists} +\usage{ +verify_named_all_1to1(x) + +verify_named_all_unique(x) + +verify_named_all_names_unique(x) + +verify_named_all_values_unique(x) + +verify_named_as_recode_unique(x) + +verify_named_no_dup_values(x) + +verify_named_no_dup_names(x) +} +\arguments{ +\item{x}{a Named vector or list. Lists values are flattened via \code{unlist()}.} +} +\value{ +\code{x} or throws an error +} +\description{ +Verify crossmap properties of named vectors or lists +} +\section{Functions}{ +\itemize{ +\item \code{verify_named_all_1to1()}: Verify named vector or list has only one-to-one relations + +\item \code{verify_named_all_unique()}: Verify name-value pairs of named vector or list are not duplicated + +\item \code{verify_named_all_names_unique()}: Verify names of named vector or list are not duplicated + +\item \code{verify_named_all_values_unique()}: Verify values in named vector or list are not duplicated (after unnesting) + +\item \code{verify_named_as_recode_unique()}: Alias of \code{verify_named_all_1to1()} + +\item \code{verify_named_no_dup_values()}: Alias of \code{verify_named_all_values_unique()} + +\item \code{verify_named_no_dup_names()}: Alias of \code{verify_named_all_names_unique()} + +}} +\examples{ +## check each fruit has a unique color +fruit_color <- c(apple = "green", strawberry = "red", banana = "yellow") +verify_named_all_1to1(fruit_color) + +## check no student is assigned to multiple groups +student_groups <- list(GRP1 = c("kate", "jane", "peter"), + GRP2 = c("terry", "ben", "grace"), + GRP3 = c("cindy", "lucy", "alex" )) +verify_named_no_dup_names(student_groups) + +## check +} diff --git a/man/verify_named_matchset.Rd b/man/verify_named_matchset.Rd new file mode 100644 index 0000000..4fad30f --- /dev/null +++ b/man/verify_named_matchset.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/verify_named.R +\name{verify_named_matchset} +\alias{verify_named_matchset} +\alias{verify_named_matchset_names_exact} +\alias{verify_named_matchset_values_exact} +\alias{verify_named_matchset_names_contain} +\alias{verify_named_matchset_values_contain} +\alias{verify_named_matchset_names_within} +\alias{verify_named_matchset_values_within} +\title{Verify unique names or values of named vector or list match expected set} +\usage{ +verify_named_matchset_names_exact(x, ref_set) + +verify_named_matchset_values_exact(x, ref_set) + +verify_named_matchset_names_contain(x, ref_set) + +verify_named_matchset_values_contain(x, ref_set) + +verify_named_matchset_names_within(x, ref_set) + +verify_named_matchset_values_within(x, ref_set) +} +\arguments{ +\item{x}{a Named vector or list. Lists values are flattened via \code{unlist()}.} + +\item{ref_set}{a vector of character strings} +} +\value{ +\code{x} or throw an error +} +\description{ +Verify unique names or values of named vector or list match expected set +} +\section{Functions}{ +\itemize{ +\item \code{verify_named_matchset_names_exact()}: Names of \code{x} \strong{exactly} match \code{ref_set} + +\item \code{verify_named_matchset_values_exact()}: Values of \code{x} \strong{exactly} match \code{ref_set} + +\item \code{verify_named_matchset_names_contain()}: Names of \code{x} \strong{contain} all of \code{ref_set} + +\item \code{verify_named_matchset_values_contain()}: Values of \code{x} \strong{contain} all of \code{ref_set} + +\item \code{verify_named_matchset_names_within()}: Names of \code{x} are all \strong{within} \code{ref_set} + +\item \code{verify_named_matchset_values_within()}: Values of \code{x} are all \strong{within} \code{ref_set} + +}} +\examples{ +fruit_color <- c(apple = "green", strawberry = "red", banana = "yellow") +fruit_set <- c("apple", "strawberry", "banana", "pear") +fruit_color |> + verify_named_matchset_names_within(ref_set = fruit_set) +} diff --git a/man/verify_pairs.Rd b/man/verify_pairs.Rd new file mode 100644 index 0000000..b79c47e --- /dev/null +++ b/man/verify_pairs.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/verify_pairs.R +\name{verify_pairs} +\alias{verify_pairs} +\alias{verify_pairs_all_1to1} +\alias{verify_pairs_all_unique} +\alias{verify_pairs_as_recode_unique} +\title{Verify crossmap properties of column pairs} +\usage{ +verify_pairs_all_1to1(df, from, to) + +verify_pairs_all_unique(df, from, to) + +verify_pairs_as_recode_unique(df, from, to) +} +\arguments{ +\item{df}{a data frame-like object with at least two columns} + +\item{from, to}{Columns in \code{x} specifying the source and target nodes} +} +\value{ +\code{df} or error +} +\description{ +Verify crossmap properties of column pairs +} +\section{Functions}{ +\itemize{ +\item \code{verify_pairs_all_1to1()}: Verify column pairs have only one-to-one relations + +\item \code{verify_pairs_all_unique()}: Verify column pairs are all unique + +\item \code{verify_pairs_as_recode_unique()}: Alias of \code{verify_pairs_all_1to1} + +}} diff --git a/man/vhas.Rd b/man/vhas.Rd new file mode 100644 index 0000000..8095ae0 --- /dev/null +++ b/man/vhas.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/vhas.R +\name{vhas} +\alias{vhas} +\alias{vhas_no_dup_pairs} +\alias{vhas_complete_weights} +\alias{vhas_xmap_props} +\alias{vhas_1to1} +\alias{vhas_1toM} +\alias{vhas_1fromM} +\title{Boolean flags for properties of candidate and validated xmap links (internal)} +\usage{ +vhas_no_dup_pairs(v_from, v_to) + +vhas_complete_weights(v_from, v_weights, tol = .Machine$double.eps^0.5) + +vhas_xmap_props(v_from, v_to, v_weights) + +vhas_1to1(v_weights) + +vhas_1toM(v_weights) + +vhas_1fromM(v_to) +} +\arguments{ +\item{v_from, v_to, v_weights}{equal length vectors containing the source-target node pairs} + +\item{tol}{numeric \eqn{\ge 0}. Ignore differences smaller than \code{tol}. +Passed through to the \code{tolerance} arg of \code{base::all.equal()}.} +} +\value{ +TRUE or FALSE +} +\description{ +\verb{vhas_*()} functions check properties of xmap links and/or candidate links. +The functions only accepts equal length vector inputs to support multiple link formats, +but does not check if the inputs are from the same xmap. +} +\section{Functions}{ +\itemize{ +\item \code{vhas_no_dup_pairs()}: Returns TRUE if xmap does not have +duplicate pairs of source-target nodes (irrespective of weights) + +\item \code{vhas_complete_weights()}: Returns TRUE if all weights for a given \code{from} label +sum to one (approximately) + +\item \code{vhas_xmap_props()}: Returns TRUE if links have no duplicate pairs and complete weights + +\item \code{vhas_1to1()}: Return TRUE if xmap recodes labels between \code{from} and \code{to} + +\item \code{vhas_1toM()}: Return TRUE if xmap has splitting links between \code{from} and \code{to} + +\item \code{vhas_1fromM()}: Return TRUE if xmap has collapsing links between \code{from} and \code{to} + +}} diff --git a/man/xmap_drop_extra.Rd b/man/xmap_drop_extra.Rd new file mode 100644 index 0000000..f357cd1 --- /dev/null +++ b/man/xmap_drop_extra.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xmap_drop_extra.R +\name{xmap_drop_extra} +\alias{xmap_drop_extra} +\title{Drop extra columns from \code{xmap} objects} +\usage{ +xmap_drop_extra(.xmap) +} +\arguments{ +\item{.xmap}{an xmap object} +} +\description{ +Drop extra columns from \code{xmap} objects +} diff --git a/man/xmap_reverse.Rd b/man/xmap_reverse.Rd new file mode 100644 index 0000000..4d7c277 --- /dev/null +++ b/man/xmap_reverse.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xmap_reverse.R +\name{xmap_reverse} +\alias{xmap_reverse} +\alias{xmap_reverse.xmap_df} +\title{Reverse xmap direction} +\usage{ +xmap_reverse(.xmap, weights_into) + +\method{xmap_reverse}{xmap_df}(.xmap, weights_into = "r_weights") +} +\arguments{ +\item{.xmap}{xmap object to be reversed} + +\item{weights_into}{A string specifying the name of a new or existing column to store reverse weights in.} +} +\value{ +xmap object of same class as \code{x}, or throws an error if \code{x} is not reversible +} +\description{ +Reverse xmap direction +} +\section{Methods (by class)}{ +\itemize{ +\item \code{xmap_reverse(xmap_df)}: Reverse a \code{xmap_df} + +}} diff --git a/man/xmap_to_matrix.Rd b/man/xmap_to_matrix.Rd new file mode 100644 index 0000000..dfdcb76 --- /dev/null +++ b/man/xmap_to_matrix.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xmap_to.R +\name{xmap_to_matrix} +\alias{xmap_to_matrix} +\alias{xmap_to_matrix.xmap_df} +\title{Extract incidence matrix from xmap objects} +\usage{ +xmap_to_matrix(.xmap, sparse, ...) + +\method{xmap_to_matrix}{xmap_df}(.xmap, sparse = TRUE, ...) +} +\arguments{ +\item{.xmap}{an xmap object} + +\item{sparse}{logical specifying if the result should be a sparse matrix. Defaults to TRUE.} + +\item{...}{Reversed for passing arguments to \code{stats::xtabs}} +} +\value{ +A matrix or sparse matrix object +} +\description{ +Transforms \code{xmap} objects into incidence matrix where the rows are indexed by the \code{from} values +and the columns are indexed by \code{to} values. Drops any additional variables. +} +\section{Methods (by class)}{ +\itemize{ +\item \code{xmap_to_matrix(xmap_df)}: Coerce a \code{xmap_df} to a Matrix + +}} +\examples{ +abc_xmap <- data.frame( + stringsAsFactors = FALSE, + origin = c("a","b","c","d","e", + "f","g","h","i","i","j","j","j"), + dest = c("AA","AA","AA","AA", + "BB","BB","CC","DD","EE","FF","GG","HH","II"), + link = c(1, 1, 1, 1, 1, 1, 1, 1, 0.5, 0.5, 0.3, 0.3, 0.4) + ) |> +as_xmap_df(origin, dest, link) +xmap_to_matrix(abc_xmap) +} +\seealso{ +Other {xmap coercion}: +\code{\link{xmap_to_named_vector}()} +} +\concept{{xmap coercion}} diff --git a/man/xmap_to_named.Rd b/man/xmap_to_named.Rd new file mode 100644 index 0000000..f388f7c --- /dev/null +++ b/man/xmap_to_named.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xmap_to.R +\name{xmap_to_named_vector} +\alias{xmap_to_named_vector} +\alias{xmap_to_named_list} +\title{Coerce a unit weight \code{xmap_df} to a named vector or list} +\usage{ +xmap_to_named_vector(.xmap) + +xmap_to_named_list(.xmap) +} +\arguments{ +\item{.xmap}{xmap with only unit weights} +} +\value{ +Named vector or list. +} +\description{ +Checks that an \code{xmap} has unit weights, and converts the +\code{from} values into: +\itemize{ +\item a vector for \code{xmap_to_named_vector()} +\item a nested list for \code{xmap_to_named_list()} +} + +Names are the unique target nodes in \code{to}, +and each element contains the source node(s) in \code{from}. +} +\examples{ +iso_vector <- c(AF = "004", AL = "008", DZ = "012", AS = "016", AD = "020") +iso_xmap <- iso_vector |> + as_pairs_from_named(names_to = "iso2c", values_to = "iso3n") |> + add_weights_unit() |> + as_xmap_df(from = iso3n, to = iso2c, weights) +identical(iso_vector, xmap_to_named_vector(iso_xmap)) +animal_list <- list(MAMM = c("elephant", "whale", "monkey"), + REPT = c("lizard", "turtle"), + CRUS = c("crab")) +animal_xmap <- animal_list |> + as_pairs_from_named(names_to = "class", values_to = "animals") |> + add_weights_unit() |> + as_xmap_df(from = animals, to = class, weights = weights) +identical(xmap_to_named_list(animal_xmap), animal_list) +} +\seealso{ +Other {xmap coercion}: +\code{\link{xmap_to_matrix}()} +} +\concept{{xmap coercion}} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..96fd256 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/tests.html +# * https://testthat.r-lib.org/reference/test_package.html#special-files + +library(testthat) +library(xmap) + +test_check("xmap") diff --git a/tests/testthat/tests.R b/tests/testthat/tests.R new file mode 100644 index 0000000..3f0c374 --- /dev/null +++ b/tests/testthat/tests.R @@ -0,0 +1,422 @@ +# Generated from create-xmap.Rmd: do not edit by hand +testthat::test_that("add_weights_*() work as expected",{ + abc_pairs <- data.frame(lower = letters[1:5], upper = LETTERS[1:5]) + abc_links <- data.frame(lower = letters[1:5], upper = LETTERS[1:5], weights = 1) + testthat::expect_equal(add_weights_unit(abc_pairs), abc_links) + + animal_pairs <- list(MAMM = c("elephant", "whale", "monkey"), + REPT = c("lizard", "turtle"), + CRUS = c("crab")) |> + as_pairs_from_named("class", "animal") + animal_links <- animal_pairs |> + dplyr::group_by(class) |> + dplyr::mutate(weights = 1/dplyr::n_distinct(animal)) |> + dplyr::ungroup() + add_links <- animal_pairs |> + add_weights_equal(from = class, to = animal, weights_into = "weights") + + testthat::expect_equal(animal_links, add_links) + } + ) + +testthat::test_that( + "vhas_* xmap validation helpers work as expected on valid df", + { + df <- tibble::tribble( + ~from, ~to, ~weights, + "A1", "B01", 1, + "A2", "B02", 1, + "A3", "B02", 1, + "A4", "B03", 0.67, + "A4", "B04", 0.33 + ) + testthat::expect_true(vhas_no_dup_pairs(df$from, df$to)) + testthat::expect_true(vhas_complete_weights(df$from, df$weights)) + testthat::expect_true(vhas_xmap_props(df$from, df$to, df$weights)) + } +) + +testthat::test_that( + "vhas_* xmap validation helpers catch invalid df", + { + df <- tibble::tribble( + ~from, ~to, ~weights, + "A1", "B01", 1, + "A2", "B02", 0.3, + "A2", "B02", 0.5 + ) + testthat::expect_false(vhas_complete_weights(df$from, df$weights)) + testthat::expect_false(vhas_no_dup_pairs(df$from, df$to)) + } +) + +testthat::test_that( + "vhas_complete_weights() works on recurring fractional weights", + { + df <- data.frame(key1 = rep("A1", 3), + key2 = c("B01", "B02", "B03"), + share = rep(1/3, 3)) + + testthat::expect_true(vhas_complete_weights(df$key1, df$share)) + } +) + +testthat::test_that( + "vhas_* relation type flag functions work as expected", + { + w_1to1 <- rep(1, 10) + w_1toM <- rep(1/6, 6) + to_1fromM <- rep("country", 4) + testthat::expect_true(vhas_recode(w_1to1)) + testthat::expect_false(vhas_recode(w_1toM)) + testthat::expect_true(vhas_split(w_1toM)) + testthat::expect_false(vhas_split(w_1to1)) + testthat::expect_true(vhas_collapse(to_1fromM)) + } +) + +testthat::test_that("verify_named_all_1to1() works as expected", { + v1toM <- c(fruit = "apple", fruit = "banana") + v1to1 <- c(A = 1, B = 2, C = 3) + l1toM <- list(fruit = c("apple", "banana")) + testthat::expect_error(verify_named_all_1to1(v1toM), class = "abort_not_1to1") + testthat::expect_error(verify_named_all_1to1(l1toM), class = "abort_not_1to1") + testthat::expect_equal(verify_named_all_1to1(v1to1), v1to1) + }) + +testthat::test_that("verify_named_*_unique() work as expected", { + vdup_pairs <- c(fruit = "apple", fruit = "apple") + ldup_pairs <- list(fruit = c("apple", "apple")) + testthat::expect_error(verify_named_all_unique(vdup_pairs), class = "abort_not_unique") + testthat::expect_error(verify_named_all_unique(ldup_pairs), class = "abort_not_unique") + vdup_names <- c(fruit = "apple", fruit = "banana") + ldup_names <- list(fruit = c("apple", "banana"), fruit = "pear") + testthat::expect_error(verify_named_all_names_unique(vdup_names), class = "abort_not_unique") + testthat::expect_error(verify_named_all_names_unique(ldup_names), class = "abort_not_unique") + vdup_values <- c(fruit = "apple", veg = "apple") + ldup_values <- list(fruit = c("apple", "banana"), veg = "apple") + testthat::expect_error(verify_named_all_values_unique(vdup_values), class = "abort_not_unique") + testthat::expect_error(verify_named_all_values_unique(ldup_values), class = "abort_not_unique") +}) + +testthat::test_that("verify_named_matchset fncs work as expected", { + v_1to1 <- c(x1 = 1, x2 = 2, x3 = 3) + refn_exact_1to1 <- c("x1", "x2", "x3") + refn_subset_1to1 <- c("x1", "x2") + refn_superset_1to1 <- c("x1", "x2", "x3", "x4") + testthat::expect_equal(verify_named_matchset_names_exact(v_1to1, refn_exact_1to1), v_1to1) + testthat::expect_error(verify_named_matchset_names_exact(v_1to1, c("not", "right")), + class = "abort_matchset") + testthat::expect_equal(verify_named_matchset_names_contain(v_1to1, refn_subset_1to1), v_1to1) + testthat::expect_equal(verify_named_matchset_names_contain(v_1to1, refn_exact_1to1), v_1to1) + testthat::expect_error(verify_named_matchset_names_contain(v_1to1, refn_superset_1to1), + class = "abort_matchset") + testthat::expect_equal(verify_named_matchset_names_within(v_1to1, refn_superset_1to1), v_1to1) + testthat::expect_equal(verify_named_matchset_names_within(v_1to1, refn_exact_1to1), v_1to1) + testthat::expect_error(verify_named_matchset_names_within(v_1to1, refn_subset_1to1), + class = "abort_matchset") + refv_exact_1to1 <- c(1, 2, 3) + refv_subset_1to1 <- c(1, 2) + refv_superset_1to1 <- c(1, 2, 3, 4) + testthat::expect_equal(verify_named_matchset_values_exact(v_1to1, refv_exact_1to1), v_1to1) + testthat::expect_error(verify_named_matchset_values_exact(v_1to1, c("not", "right")), + class = "abort_matchset") + testthat::expect_equal(verify_named_matchset_values_contain(v_1to1, refv_subset_1to1), v_1to1) + testthat::expect_equal(verify_named_matchset_values_contain(v_1to1, refv_exact_1to1), v_1to1) + testthat::expect_error(verify_named_matchset_values_contain(v_1to1, refv_superset_1to1), + class = "abort_matchset") + testthat::expect_equal(verify_named_matchset_values_within(v_1to1, refv_superset_1to1), v_1to1) + testthat::expect_equal(verify_named_matchset_values_within(v_1to1, refv_exact_1to1), v_1to1) + testthat::expect_error(verify_named_matchset_values_within(v_1to1, refv_subset_1to1), + class = "abort_matchset") +}) + +testthat::test_that("verify_pairs_* work as expected", { + v_1to1 <- c(x1 = 1, x2 = 2, x3 = 3) + pairs_1to1 <- tibble::enframe(v_1to1, "f", "t") + testthat::expect_identical(verify_pairs_all_1to1(pairs_1to1, f, t), pairs_1to1) + testthat::expect_identical(verify_pairs_all_unique(pairs_1to1, f, t), pairs_1to1) +} +) + +testthat::test_that(".calc_xmap_subclass_attr() rejects unknown subclass", + { + testthat::expect_error(.calc_xmap_subclass_attr("unknown")) + }) + +testthat::test_that( + "new_xmap_df() accepts arbitrary data.frames with correct from argument", + { + df <- data.frame( + x = letters[1:5], + y = 1:5, + z = runif(5) + ) + xmap <- new_xmap_df(x = df, "x", "y", "z") + xmap_attrs <- attributes(xmap) + testthat::expect_s3_class(xmap, .calc_xmap_subclass_attr("xmap_df")) + testthat::expect_identical(xmap_attrs$col_from, "x") + testthat::expect_identical(xmap_attrs$col_to, "y") + testthat::expect_identical(xmap_attrs$col_weights, "z") + testthat::expect_identical(xmap_attrs$from_set, unique(df$x)) + } +) + +testthat::test_that("abort_col_order() works as expected", + { + df <- data.frame(a = 1, b = 2, c = 3) + testthat::expect_invisible(abort_col_order(df, "a", "b", "c")) + testthat::expect_identical(abort_col_order(df, "a", "b", "c"), df) + testthat::expect_error(abort_col_order(df, "b", "a", "c"), + class = "abort_col_order") + }) + +testthat::test_that( + "validate & verify xmap fncs accept well-formed xmaps", + { + df <- tibble::tribble( + ~node_A, ~node_B, ~w_AB, + "A1", "B01", 1, + "A2", "B02", 1, + "A3", "B02", 1, + "A4", "B03", 0.25, + "A4", "B04", 0.75 + ) + x <- new_xmap_df(df, "node_A", "node_B", "w_AB") + out <- testthat::expect_invisible(validate_xmap_df(x)) + testthat::expect_identical(out, x) + testthat::expect_identical(df, verify_links_as_xmap(df, node_A, node_B, w_AB)) + } +) + +## columns present +testthat::test_that( + "validate & verify fncs reject missing columns", + { + df <- tibble::tribble( + ~from, ~to, ~weights, + "A1", "B01", 1 + ) + x <- new_xmap_df(df, "from", "missing_col", "weights") + testthat::expect_error(abort_missing_cols(df, c("from", "missing_col", "weights")), + class = "abort_missing_cols" + ) + testthat::expect_error(validate_xmap_df(x), + class = "abort_missing_cols" + ) + testthat::expect_error(verify_links_as_xmap(df, node_A, node_B, w_AB), + class = "abort_missing_cols") + } +) + +## any NA values +testthat::test_that( + "validate & verify xmap fncs reject missing values", + { + df <- tibble::tribble( + ~from, ~to, ~weights, + "A1", "B2", NA, + NA, "B2", NA, + "A3", "B1", 1 + ) + x <- new_xmap_df(df, "from", "to", "weights") + testthat::expect_error(validate_xmap_df(x), class = "abort_na") + testthat::expect_error(verify_links_as_xmap(df, from, to, weights), + class = "abort_na") + } +) + + +## column type +testthat::test_that( + "validate & verify xmap fncs reject non-numeric weight columns", + { + df <- tibble::tribble( + ~f, ~t, ~w, + "A1", "B01", 1, + "A4", "B03", 0.25, + "A4", "B04", 0.75 + ) |> + dplyr::mutate(w = as.character(w)) + testthat::expect_error(abort_weights_col_type(df, "weights"), + class = "abort_col_type" + ) + x <- new_xmap_df(df, "f", "t", "w") + testthat::expect_error(verify_links_as_xmap(df, f, t, w), + class = "abort_col_type") + } +) + +## from set check +testthat::test_that( + "validate_xmap_df() rejects mismatching from_set", + { + df <- tibble::tribble( + ~from, ~to, ~weights, + "A1", "B01", 1, + "A4", "B03", 0.25, + "A4", "B04", 0.75 + ) + bad_set <- c("bad set", "of", "nodes") + testthat::expect_error(abort_from_set(df, "from", bad_set)) + x <- new_xmap_df(df, "from", "to", "weights", from_set = bad_set) + testthat::expect_error(validate_xmap_df(x)) + } +) + +## duplicate links +testthat::test_that( + "validate and verify xmap fncs reject duplicate from-to links", + { + df <- tibble::tribble( + ~f, ~t, ~w, + "A1", "B02", 0.3, + "A1", "B02", 1 + ) + testthat::expect_error(abort_dup_pairs(df, "f", "t"), class = "abort_dup_pairs") + x <- new_xmap_df(df, "f", "t", "w") + testthat::expect_error(verify_links_as_xmap(df, f, t, w), + class = "abort_dup_pairs") + } +) + +## complete weights +testthat::test_that( + "validate & verify xmap fncs rejects invalid weights", + { + df <- tibble::tribble( + ~f, ~t, ~w, + "A1", "B01", 0.4, + "A1", "B02", 0.59 + ) + x <- new_xmap_df(df, "f", "t", "w") + testthat::expect_error(verify_links_as_xmap(df, f, t, w), + class = "abort_bad_weights") + } +) + +testthat::test_that( + "as_xmap() is returns expected xmap subclasses", + { + tbl_links <- tibble::tribble( + ~f, ~t, ~w, + "A1", "B01", 1, + "A2", "B02", 1, + "A3", "B02", 1, + "A4", "B03", 0.67, + "A4", "B04", 0.33 + ) + df_links <- as.data.frame(tbl_links) + + ## default subclasses work as expected + testthat::expect_s3_class(as_xmap_df(df_links, f, t, w), + .calc_xmap_subclass_attr("xmap_df")) + + ## override subclass works as well + testthat::expect_s3_class(as_xmap_df(tbl_links, f, t, w, subclass = "xmap_df"), + .calc_xmap_subclass_attr("xmap_df")) + } +) + +testthat::test_that("xmap_to_matrix handles xmaps with different column counts",{ + links <- tibble::tribble( + ~f, ~t, ~w, + "A1", "B01", 1, + "A2", "B02", 1, + "A3", "B02", 1, + "A4", "B03", 0.25, + "A4", "B04", 0.75 + ) + xmap_small <- new_xmap_df(links, "f", "t", "w") + + links_extra <- links |> + dplyr::mutate(ex = "extra") + xmap_extra <- new_xmap_df(links_extra, "f", "t", "w") + + xmap_matrix_small <- xmap_small |> xmap_to_matrix() + xmap_matrix_extra <- xmap_extra |> xmap_to_matrix() + + testthat::expect_identical(xmap_matrix_small, xmap_matrix_extra) + } + ) + +testthat::test_that("xmap_to_named works as expected", { + links <- tibble::tribble( + ~f, ~t, ~w, + "A1", "B01", 1, + "A2", "B02", 1, + "A3", "B02", 1, + "A4", "B03", 0.25, + "A4", "B04", 0.75 + ) + ## works for collapse relations + xmap_unit <- new_xmap_df(links[1:3,], "f", "t", "w") + unit_list <- list(B01 = c("A1"), B02 = c("A2", "A3")) + unit_vector <- tibble::deframe(xmap_unit[,c("t", "f")]) + testthat::expect_identical(unit_list, xmap_to_named_list(xmap_unit)) + testthat::expect_identical(unit_vector, xmap_to_named_vector(xmap_unit)) + ## rejects split relations + xmap_mixed <- new_xmap_df(links, "f", "t", "w") + testthat::expect_error(xmap_to_named_list(xmap_mixed), + class = "abort_frac_weights") +}) + +testthat::test_that("xmap_to_named_list() reverses as_pairs_from_named()", { + link_list <- list(AA = c("x3", "x4", "x6"), + BB = c("x1", "x5"), + CC = c("x2") + ) + link_xmap <- + as_pairs_from_named(link_list, + "capital", "xvars") |> + add_weights_unit(weights_into = "w") |> + new_xmap_df("xvars", "capital", "w") + testthat::expect_identical(xmap_to_named_list(link_xmap), link_list) +}) + +testthat::test_that("xmap_reverse.xmap_df() works as expected", { + df_x <- tibble::tribble( + ~from, ~to, ~weights, + "A1", "B01", 1, + "A4", "B03", 0.25, + "A4", "B04", 0.75 + ) |> as.data.frame() |> + new_xmap_df("from", "to", "weights") + + df_x_rev <- data.frame( + to = df_x$to, + from = df_x$from, + r_weights = 1 + ) |> + new_xmap_df("to", "from", "r_weights") + + # class checks + testthat::expect_s3_class(xmap_reverse.xmap_df(df_x), class(df_x_rev)) + testthat::expect_s3_class(xmap_reverse(df_x), class(df_x_rev)) + + # output checks + testthat::expect_identical(xmap_reverse.xmap_df(df_x), df_x_rev) + testthat::expect_identical(abort_not_reversible(df_x,"to"), df_x) +} +) + +testthat::test_that('xmap_drop_extra works as expected', { + links <- tibble::tribble( + ~f, ~t, ~w, + "A1", "B01", 1, + "A2", "B02", 1, + "A3", "B02", 1, + "A4", "B03", 0.25, + "A4", "B04", 0.75 + ) + xmap_small <- new_xmap_df(links, "f", "t", "w") + + links_extra <- links |> + dplyr::mutate(ex = "extra") + xmap_extra <- new_xmap_df(links_extra, "f", "t", "w") + + xmap_drop <- xmap_extra |> xmap_drop_extra() + + testthat::expect_identical(xmap_small, xmap_drop) +}) + diff --git a/vignettes/making-xmaps.Rmd b/vignettes/making-xmaps.Rmd new file mode 100644 index 0000000..ce9e6b5 --- /dev/null +++ b/vignettes/making-xmaps.Rmd @@ -0,0 +1,466 @@ +--- +title: "Making and Verifying Crossmaps" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Making and Verifying Crossmaps} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup} +library(xmap) +``` + +## Crossmaps + +A (Nomenclature) Crossmap encodes a complete redistribution of values between a source and target classifications as a directed, bipartite, weighted graph. Numeric values transformed using a valid Crossmap will sum to the same total in both the source and target classifications. A valid Crossmap satisfies the following conditions: + +1. There is at most one link between each distinct source and target node +2. For each source node, the sum of weights attached to all outgoing links sums to one. + +For example, this mapping between occupation codes in ANZSCO22 and ISCO8 is a valid crossmap: + +![A crossmap for converting Australian occupation categories (ANZSCO22) to an international standard (ISCO8).](plot-anzsco-isco-bigraph.png) + +### Unit and Fractional Weight Links + +The weights associated with each source-target link encode the transformation to be applied to values associated with a given source node. Let the weight between source node $i$ and target node $j$ be denoted $w_{ij}$. Weights can range from $[0,1]$. Note that $w_{ij} = 0$ is the trivial case where there is no link between the source and target nodes. The non-trivial cases are unit weights, $w_{ij} = 1$ and fractional weights, $w_{ij} \in (0,1)$. + +Unit weights indicate that source values will be unmodified when passed into the target nomenclature. Therefore, crosswalks for recoding node labels, or collapsing multiple sub-category source nodes into parent target nodes, can both be represented as crossmaps with unit weights. + +Fractional weights on the other hand indicate how a source value will be modified when passed to the target node. For example a link with a weight of $0.7$ indicates that 70% of the numeric value associated with a source node should be "passed over" to the target node. + +Now, consider two links from the same source node $i$ to two different target nodes $j$ and $k$, with weights $w_{ij}$ and $w_{ik}$. Assume that the source node $i$ has no other outgoing links into the target nomenclature. Then, it follows that the weights on the two given links should sum to one if we want to preserve the total value between the source and target nomenclature. For instance, if $w_{ij} = 0.7$, then $w_{ik}$ should be $0.3$, such that 70% of the value associated with source node $i$ goes to the target node $j$, and the remaining 30% is distributed to target node $k$. + +### Crossmap tables + +Crossmap tables are extensions of crosswalk or lookup tables, where each row represents a weighted link between the source and target classifications. You can pass a data.frame or tibble of weighted edges to `verify_links_as_xmap()` to check if the input satisfies crossmap properties. + +Your input table `x` needs to have at least 3 complete (i.e. no `NA`) columns: + +- `from`: source classification labels. +- `to`: target classification labels +- `weights`: applied to values in the source classification + +Additional columns (e.g. for label descriptions etc.) can be retained by setting the `.drop_extra` argument to `FALSE`. + +`verify_links_as_xmap` and `as_xmap_df()` will validate that: + +1. There are no duplicates `from`-`to` pairs, +2. Every group of `weights` associated with a distinct `from` value sums to 1 (subject to a floating point tolerance). + +The package also offers a experimental `xmap_df` class facilitate additional functionality such as graph property calculation and printing (i.e. relation types and crossmap direction) via a custom `print()` method, coercion to other useful classes (e.g. `xmap_to_matrix()`), visualisation (see `vignette("vis-xmaps")` for prototypes), and multi-step transformations (i.e. from nomenclature A to B to C). + +Please note that the `xmap_df` class and related functions are still in active development and subject to breaking changes in future releases. + +## Special Cases of Crossmaps + +The conditions for valid nomenclature crossmaps defined above are sufficient to ensure that the total sum of value is retained between the source and target nomenclature However, there are a number of other transformation properties which crossmaps can also encode and validate. Such special cases include "degenerate" crossmaps. We consider a crossmap "degenerate" if it has only binary weights on all possible source-target links. In such cases the weights are implied by the presence or absence of a link. + +### Types of Mapping Relations + +It is helpful when thinking about special cases to define some familiar types of relations: + +- **one-to-one**: a set of links where pairs of source and target nodes are uniquely linked to each other and no other nodes. Any links in this type of relation will have unit weights. +- **one-to-many**: a set of links where a single source node is linked to multiple target nodes. Links in this "splitting" relation will have fractional weights. +- **one-from-many**: a set of links where a single target node is linked to multiple source nodes. This "collapse" relation is more commonly known as a **many-to-one**. Links can have either unit or fractional weights depending on whether the source nodes are part of one-to-one or one-to-many relations. +- **many-to-many**: refers to a combination of the above relations and is often used to signal that a correspondence between sets has both **one-to-many** splitting relations and **one-from-many** collapsing relations. + +Note that links in a crossmap can be partitioned into subgraphs according to the number of outgoing links (i.e. one-to-one and one-to-many) + +### Recode maps + +Crossmaps with only one-to-one relations recode source labels to target labels, leaving any attached values untouched. This implies that the weights on links are always 1. Notice that Schema Crosswalks, lookup tables and category recoding functions are all Recode Maps. + +A common way of implementing recodings in R is named vectors or pairwise arguments such as in `forcats::fct_recode()`. Named vectors are a convenient way to store and use one-to-one and many-to-one mappings. In general it is not necessary to convert such mappings into crossmaps unless you want to combine them with fractional weight links. However, when necessary, named vectors can be converted into `xmap` by first transforming vector into two-column table of node pairs, and then attaching the implied unit weights. + +``` r +# example using the named vector included in `mock` objects: +mock$recode_vect + +recode_xmap <- + xmap::as_pairs_from_named(mock$recode_vect, names_to = "iso3", values = "ctr_name") |> + xmap::add_weights_unit(weights_into = "w") |> + xmap::as_xmap_df(from = iso3, to = ctr_name, weights = w) +``` + +For a crossmap to be a recode map it must have binary weights, and the cardinality of the source and target node sets must be equal. In other words, weights can only be 0 (i.e. no link) or 1, and the number of unique source and target labels should be the same. The function `verify_named_all_1to1()` or its alias `verify_named_as_recode_unique()` uses the cardinality condition to verify whether or not a named vector has only one-to-one relation: + +```{r} +fruit_color <- c(apple = "green", strawberry = "red", banana = "yellow") +fruit_color |> + verify_named_all_1to1() |> + print() +``` + +We can also identify mistakes such as accidentally assigning a value twice: + +```{r error=TRUE} +fruit_color_mistake <- c(fruit_color, pear = "green") + +fruit_color_mistake |> + verify_named_as_recode_unique() +``` + +### Collapse or Aggregation maps + +Collapse crossmaps are similar to Recode crossmaps in that source values are unmodified when passed into the target nomenclature. However, as the name suggests, multiple source nodes can be "collapsed" into the same target node, such that at least some of the source values will be aggregated in the target nomenclature. Similar to a Recode crossmap, a Collapse crossmap must only have unit weights, but the cardinality of the source node set will be larger than the target node set. This means that at least two source nodes must be linked to the same target node since there are fewer target nodes than source nodes. + +Similar to the recode maps, collapse maps only have unit weights. Hence, we can use named lists to store such mappings. Consider an example such as assigning students to groups. Assume no two students share the same name: + +```{r} +student_groups <- list(GRP1 = c("kate", "jane", "peter"), + GRP2 = c("terry", "ben", "grace"), + GRP3 = c("cindy", "lucy", "alex" )) +``` + +We can check that students are not assigned to multiple groups: + +```{r} +student_groups |> + verify_named_all_values_unique() +``` + +Or that the every student has been assigned a group: + +```{r error=TRUE} +## mistakenly assign kate to another group +student_group_mistake <- list(GRP1 = c("kate", "jane", "peter"), + GRP2 = c("terry", "ben", "grace"), + GRP2 = c("cindy", "lucy", "kate" )) + +student_list <- c("kate", "jane", "peter", "terry", "ben", "grace", "cindy", "lucy", "alex") + +student_group_mistake |> + verify_named_matchset_values_exact(student_list) +``` + +and each group has a unique name: + +```{r error=TRUE} +student_group_mistake |> + verify_named_all_names_unique() +``` + +Coercion to `xmap` requires adding weights and is directional. We can choose to verify conditions and return the same object via `verify_links_*` functions, or coerce to an `xmap` for use with other functions in the package. + +```{r error=TRUE} +group_links <- student_groups |> + as_pairs_from_named(names_to = "group", values_to = "student") |> + add_weights_unit() + +## collapse xmap from students to groups +group_links |> + verify_links_as_xmap(from = student, to = group, weights = weights) + +## reverse doesn't work without adjusting the weights +group_links |> + verify_links_as_xmap(from = group, to = student, weights = weights) +``` + +Aggregation transformations can implemented as Collapse maps. Additionally, aggregation size requirements can be implemented as conditions on the number of incoming links for each target node. For instance, if you know that each target node should aggregate values from at least two source nodes, then the minimum number of non-zero incoming links should be 2 for every target node. Notice that this requirement precludes one-to-one links since it prevents any source node from being recoded into an "unshared" node in the target nomenclature. + +### Split or Redistribution maps (with Equal Weights) + +Any crossmap with at least one set of fractional weight (one-to-many) links involve some redistribution of source value. However, a useful special case of maps with fractional weights are split maps, which encode the disaggregation of source values into a target nomenclature. Such maps which will have mostly one-to-many relations, and do not contain one-from-many relations. This can be ensured by restricting each target node to only have one incoming link. Furthermore, we can preclude one-to-one links with the condition that all links must have fractional weights. + +Notice that the same set of node pairs can generate either a split or collapse map depending on the weights added. For example, recall that in the student group example from above, swapping the `to` and `from` variables no longer formed a valid crossmap. Now consider allocating some budget of prizes equally between students in each group. This requires adding fractional weights to `group_links`, which can then be verified as a crossmap. + +```{r} +group_prize_links <- student_groups |> + as_pairs_from_named("group", "student") |> + dplyr::group_by(group) |> + dplyr::mutate(prize_share = 1 / dplyr::n_distinct(student)) + +group_prize_links |> + verify_links_as_xmap(from = group, to = student, weights = prize_share) +``` + +Note that if the first crossmap condition is met, i.e. there is only one link between each unique source-node pair, then `dplyr::n()` and `dplyr::n_distinct()` are interchangeable when generating equal share weights. + +## Creating `xmap_df` from Pairs and Links + +The following examples demonstrate how to coerce various inputs into the experimental `xmap_df` class. + +### Row-wise Links + +The following example shows how to create a mixed crossmap which encodes one-to-one, many-to-one and one-to-many relations and coerce them into a `xmap_df`. This method is most suitable for simple crossmaps. + +```{r} +simple_links <- tibble::tribble( + ~source, ~target, ~share, + "equal", "EQUAL", 1, # one-to-one + "member_1", "GROUP", 1, # one-FROM-many + "member_2", "GROUP", 1, + "whole", "PART_1", 0.3, # one-to-many + "whole", "PART_2", 0.6, + "whole", "PART_3", 0.1 +) + +simple_xmap <- simple_links |> + as_xmap_df(from = source, to = target, weights = share) + +simple_xmap +``` + +### Crosswalk/Lookup Tables + +It is more common that you will want to convert an existing correspondence into a crossmap. Such conversions require attaching appropriate weights to the existing crosswalk table. + +#### Recode Pairs + +Consider the first five country codes in the ISO 3166 international standard and the one-to-one correspondence between the 2-digit, 2-digit and numeric codes. + +```{r} +iso_codes <- tibble::tribble( + ~country, ~ISO2, ~ISO3, ~ISONumeric, + "Afghanistan", "AF", "AFG", "004", + "Albania", "AL", "ALB", "008", + "Algeria", "DZ", "DZA", "012", + "American Samoa", "AS", "ASM", "016", + "Andorra", "AD", "AND", "020" + ) +``` + +We can verify any pairwise combination of columns as one-to-one recode maps using the `verify_pairs_*` functions: + +```{r} +iso_codes |> + verify_pairs_as_recode_unique(from = country, to = ISO2) |> + print() +``` + +To create a crossmap between `ISONumeric` and `ISO2`, we simply add a weights columns and coerce to `xmap_df`: + +```{r} +iso_xmap <- iso_codes |> + add_weights_unit(weights_into = "weight") |> + as_xmap_df(from = ISONumeric, to = ISO2, weights = weight) +``` + +Notice that `as_xmap_df()` always place the `from`, `to` and `weights` columns in order and drops any additional columns passed to it. + +```{r} +print(iso_xmap) +``` + +To convert the validated xmap into a named vector: + +```{r} +iso_xmap |> + xmap_to_named_vector() +``` + +We can also easily generate validated crossmaps for the other nomenclature in the table, and keep additional columns: + +```{r} +iso_codes |> + add_weights_unit(weights_into = "weight") |> + as_xmap_df(from = ISO2, to = ISO3, weights = weight, .drop_extra = FALSE) +``` + +#### Nested Lists + +Now consider aggregating data which were collected using the ISO 3166-2 Subdivisions of [Australia](https://en.wikipedia.org/w/index.php?title=ISO_3166-2:AU&oldid=1110907059) and [Canada](https://en.wikipedia.org/w/index.php?title=ISO_3166-2:CA&oldid=1110906706): + +```{r} +adm1_list <- tibble::tribble( + ~ctr, ~adm1, + "AU", "AU-NSW, AU-QLD, AU-SA, AU-TAS, AU-VIC, AU-WA, AU-ACT, AU-NT", + "CA", "CA-AB, CA-BC, CA-MB, CA-NB, CA-NL, CA-NS, CA-ON, CA-PE, CA-QC, CA-SK, CA-NT, CA-NU, CA-YT" +) +``` + +Recall that we need one row per relation between the source (`adm1`) and target (`ctr`) nomenclature. Thus we split the string list into a vector, and then unnest the values by country. + +```{r} +agg_x <- adm1_list |> + dplyr::mutate(adm1 = stringr::str_split(adm1, ", ")) |> + tidyr::unnest(cols = c(adm1)) + +agg_x +``` + +Since aggregation involves the one-to-one transfer of values between `adm1` and `ctr` prior to the collapsing the `ctr` groups, we simple add unit weights to form a valid crossmap: + +```{r} +agg_xmap <- agg_x |> + add_weights_unit(weights_into = "link") |> + as_xmap_df(from = adm1, to = ctr, weights = link) + +agg_xmap +``` + +#### References for Custom Weights + +Conversely, we might have aggregate level data which want to disaggregate. Continuing the above example, this could involve incorporating country level data into analysis at the 3166-2 Subdivisions level. + +For example, imagine that we have population figures for Australia at the 3166-2 level for 9 out of 10 years, but only country level figures for the missing year. In this simple example, a reasonable harmonisation design could involve splitting the country level figure by the subdivision level population proportions from the year preceding or following the missing year. + +Alternatively, we might want to compare aggregate and disaggregate statistics to identify discrepancies. + +Using Australian population statistics from 30 Jun 2022[^1] + +[^1]: Source: [Australian Bureau of Statistics](https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/jun-2022) + +```{r} +state_data <- tibble::tribble( + ~state, ~adm1, ~Pop, + "New South Wales", "AU-NSW", 8153600, + "Victoria", "AU-VIC", 6613700, + "Queensland", "AU-QLD", 5322100, + "South Australia", "AU-SA", 1820500, + "Western Australia", "AU-WA", 2785300, + "Tasmania", "AU-TAS", 571500, + "Northern Territory", "AU-NT", 250600, + "Australian Capital Territory", "AU-ACT", 456700 +) + +state_xmap <- state_data |> + dplyr::mutate(ctr = "AU", + adm1, + share = Pop / sum(Pop)) |> + as_xmap_df(from = ctr, to = adm1, weights = share) + +state_xmap +``` + +### Piecewise Construction + +Now consider the following mixed transformation using selected correspondences between NAICS Canada 1997 and ISIC Revision 3. Imagine that we have some numeric data (e.g. gross output in CAD) collected in the NAICS Canada nomenclature that we want to harmonise into ISIC Revision 3. The correspondence between the two nomenclature contains a mixture of one-to-one, one-to-many, and one-from-many relations. + +Luckily, we can split the source nomenclature into two groups: + +1. source nodes with only one outgoing link (one-to-one relations with unit weights) +2. source nodes with multiple outgoing links (one-to-many relations with fractional weights) + +Let's first define somed example correspondences.[^2] + +[^2]: based on examples provided by Statistics Canada on the page [How to Read a Concordance Table](https://www.statcan.gc.ca/en/subjects/standard/concordances/concordanc_tabl3). + +In the first example, one NAICS Canada class relates to exactly one ISIC class, forming a one-to-one relation. + +```{r} +canada_recode <- tibble::tibble( + NAICS1997 = "212210", + NAICS1997_desc = "Iron Ore Mining", + ISIC3 = "C1310", + ISIC3_desc = "Mining of iron ores" +) +``` + +```{r echo=FALSE} +knitr::kable(canada_recode) +``` + +In the second example, the ISIC target class `D1543` is equivalent to more than one NAICS Canada source class, forming a one-from-many relation. The asterisk (Partial Flag) indicates that part of ISIC D1543 is equivalent to each NAICS Canada class. The ISIC activities corresponding to each NAICS Canada class are listed in the column labelled "Link". + +```{r} +canada_agg <- tibble::tribble( + ~NAICS1997, ~NAICS1997_desc, ~ISIC3, ~ISIC3_desc, ~Link, + "311320", "Chocolate and Confectionery Manufacturing from Cacao Beans", "D1543 *", "Manufacture of cocoa, chocolate and sugar confectionery", "Chocolate and confectionery, made from cacao beans", + "311330", "Confectionery Manufacturing from Purchased Chocolate", "D1543 *", "Manufacture of cocoa, chocolate and sugar confectionery", "Confectionery, made from purchased chocolate", + "311340", "Non-Chocolate Confectionery Manufacturing", "D1543 *", "Manufacture of cocoa, chocolate and sugar confectionery", "Non-chocolate confectionery, manufacturing" +) +``` + +```{r echo=FALSE} +knitr::kable(canada_agg) +``` + +Notice that for both one-to-one and one-from-many relations, values attached to each source category are not directly modified during the "transfer" between source and target nomenclature. Instead, the source values are either retained, or summarised when the category collapse (and value aggregation) is performed. Thus, as shown above, all links in the above examples will take unit weights. + +In this third example, one souce NAICS Canada class is equivalent to more than one target ISIC class, forming a one-to-many relation. + +```{r} +canada_split <- tibble::tribble( + ~NAICS1997, ~NAICS1997_desc, ~ISIC3, ~ISIC3_desc, ~Link, + "483213", "Inland Water Transportation (except by Ferries)", "I6110 *", "Sea and coastal water transport", "Intracoastal water transportation", + "483213", "Inland Water Transportation (except by Ferries)", "I6120 *", "Inland water transport", "Inland water transportation (except ferries)" +) +``` + +```{r echo=FALSE} +knitr::kable(canada_split) +``` + +Let's clean up the recode and collapse relations shown above: + +```{r} +canada_unit <- canada_agg |> + # remove the partial flag (*) + dplyr::mutate(ISIC3 = stringr::str_remove(ISIC3, " \\*")) |> + dplyr::select(-Link) |> + # bind the links together and add weights + dplyr::bind_rows(canada_recode) |> + dplyr::mutate(share = 1) +``` + +```{r echo=FALSE} +knitr::kable(canada_unit) +``` + +Now all that remains is to prepare the split links. Similar to the disaggregation example above, we need to design weights to allocate the "pool" of numeric value associated with the NAICS class `483213` into the corresponding ISIC classes `I6110` and `I6120`. + +Assume for illustration purposes that we have reference data to suggest the Canadian "Inland water transport" industry (`I6120`) is twice as big as the "Sea and coastal water transport" industry (`I6110`). This suggests that the weight between `483213` and `I6120` should be twice that of `I6110`. + +```{r} +canada_frac <- canada_split |> + dplyr::mutate(ISIC3 = stringr::str_remove(ISIC3, " \\*")) |> + dplyr::select(-Link) |> + dplyr::mutate(share = dplyr::case_when(ISIC3 == "I6110" ~ 0.33, + ISIC3 == "I6120" ~ 0.67, + T ~ NA_real_)) +``` + +Now let's combine the unit and fractional links into a crossmap: + +```{r} +canada_xmap <- dplyr::bind_rows(canada_unit, canada_frac) |> + as_xmap_df(from = NAICS1997, to = ISIC3, weights = share) + +print(canada_xmap) +``` + +## Reversing Crossmaps + +### One-Way Maps + +Except in the case of recoding, crossmaps are generally lateral (one-way). Weights on collapse and split links are no longer valid if you reverse the direct of the link. Notice that `as_xmap_df()` throws an error if you try to naively swap the `from` and `to` arguments: + +```{r error=TRUE} +dplyr::bind_rows(canada_unit, canada_frac) |> + as_xmap_df(from = ISIC3, to = NAICS1997, weights = share) +``` + +### Reversible Maps + +However, we **can** swap the arguments on a recode map. Recall the ISO country code crossmap we created above: + +```{r} +print(iso_xmap) +``` + +Imagine that instead of converting country codes from ISO Numeric to ISO-2 digit, we wanted to convert from ISO-2 digit to ISO Numeric. We can take the existing crossmap and invert it without editing any weights: + +```{r} +iso_xmap |> + xmap_reverse() +``` + +A less trivial reversal is creating an aggregation map from a disaggregation map. Recall the country to state level disaggregation map we created above: + +```{r} +state_xmap |> + xmap_drop_extra() +``` + +Now imagine we wanted to re-aggregate the data, say after some state-level adjustments: + +```{r} +state_xmap |> + xmap_reverse(weights_into = "agg_w") |> + xmap_drop_extra() +``` diff --git a/vignettes/plot-anzsco-isco-bigraph.png b/vignettes/plot-anzsco-isco-bigraph.png new file mode 100644 index 0000000..8e0d578 Binary files /dev/null and b/vignettes/plot-anzsco-isco-bigraph.png differ diff --git a/vignettes/plot-weight-sum-matrix.png b/vignettes/plot-weight-sum-matrix.png new file mode 100644 index 0000000..29f8dce Binary files /dev/null and b/vignettes/plot-weight-sum-matrix.png differ diff --git a/vignettes/vis-xmaps.Rmd b/vignettes/vis-xmaps.Rmd new file mode 100644 index 0000000..4635e54 --- /dev/null +++ b/vignettes/vis-xmaps.Rmd @@ -0,0 +1,255 @@ +--- +title: "Visualising Crossmap Transformations" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Visualising Crossmap Transformations} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Alternative representations of Crossmaps + +Crossmaps aim to encode dataset integration and harmonisation choices separately to the code used to apply those such designs to data. It follows that visualisations and plots of the candidate crossmaps could be useful during the design process. For instance, Sankey diagrams are sometimes used to visualise schema crosswalks. + +This article provides a few ggplot2 code examples for visualising crossmaps. The package will offer functions for generating these visualisations from `xmap` objects in future releases. + +```{r message=FALSE} +library(ggplot2) +library(dplyr) +library(stringr) +library(patchwork) +library(ggbump) +library(xmap) +``` + +### Table + +Let's start with visualising a section of the ANZSCO22 to ISCO8 crosswalk published by the Australian Bureau of Statistics: + +```{r} +anzsco_cw <- tibble::tribble( + ~anzsco22, ~anzsco22_descr, ~isco8, ~partial, ~isco8_descr, + "111111", "Chief Executive or Managing Director", "1112", "p", "Senior government officials", + "111111", "Chief Executive or Managing Director", "1114", "p", "Senior officials of special-interest organizations", + "111111", "Chief Executive or Managing Director", "1120", "p", "Managing directors and chief executives", + "111211", "Corporate General Manager", "1112", "p", "Senior government officials", + "111211", "Corporate General Manager", "1114", "p", "Senior officials of special-interest organizations", + "111211", "Corporate General Manager", "1120", "p", "Managing directors and chief executives", + "111212", "Defence Force Senior Officer", "0110", "p", "Commissioned armed forces officers", + "111311", "Local Government Legislator", "1111", "p", "Legislators", + "111312", "Member of Parliament", "1111", "p", "Legislators", + "111399", "Legislators nec", "1111", "p", "Legislators" + ) + +links <- anzsco_cw |> + dplyr::group_by(anzsco22) |> + dplyr::summarise(n_dest = dplyr::n_distinct(isco8)) |> + dplyr::ungroup() |> + dplyr::transmute(anzsco22, weight = 1/n_dest) |> + dplyr::left_join(anzsco_cw, by = "anzsco22") + +## get code tables +table_anzsco <- anzsco_cw |> + dplyr::distinct(anzsco22, anzsco22_descr) +table_isco8 <- anzsco_cw |> + dplyr::distinct(isco8, isco8_descr) + +## make xmap +anzsco_xmap <- links |> + as_xmap_df(anzsco22, isco8, weight) +``` + +The included `print()` method for `xmap_df` objects: + +```{r} +print(anzsco_xmap) +``` + +### Bigraph + +Visualisation as a bigraph is particularly useful for seeing the relations between the two nomenclature. + +```{r message=FALSE} +.bigraph_add_link_style <- function(edges, x_attrs, ...) { + ## generate out link type + style_out_case <- tibble::tribble( + ~out_case, ~line_type, ~font_type, + "unit_out", "solid", "bold", + "frac_out", "dashed", "italic") + edges |> + dplyr::mutate(out_case = dplyr::case_when(.data[[x_attrs$col_weights]] == 1 ~ "unit_out", + .data[[x_attrs$col_weights]] < 1 ~ "frac_out")) |> + dplyr::left_join(style_out_case, + by = "out_case") |> + dplyr::ungroup() +} + +.bigraph_add_node_positions <- function(edges, x_attrs, pos_from, pos_to, ...) { + ## attach node positions + edges |> + dplyr::left_join(pos_from, by = setNames("from_set", x_attrs$col_from)) |> + dplyr::left_join(pos_to, by = setNames("to_set", x_attrs$col_to)) |> + dplyr::mutate(from_x = 0, + to_x = 5) |> + dplyr::mutate(idx = dplyr::row_number()) +} + +plt_xmap_bigraph <- function(x, ...) { + stopifnot(is_xmap_df(x)) + x_attrs <- attributes(x) + edges_short <- tibble::as_tibble(x) + + df_out_style <- .bigraph_add_link_style(edges_short, x_attrs) + + ## generate node positions + from_nodes <- tibble::tibble(from_set = x_attrs$from_set) |> + dplyr::mutate(from_y = dplyr::row_number()) + to_nodes <- tibble::tibble(to_set = unique(x[[x_attrs$col_to]])) |> + dplyr::mutate(to_y = dplyr::row_number() - 1 + 0.5) + + df_gg <- .bigraph_add_node_positions(df_out_style, x_attrs, + from_nodes, to_nodes) + ## build ggplot + ggplot2::ggplot(data = df_gg, + aes(x = from_x, xend = to_x, + y = from_y, yend = to_y, + group = idx)) + + ## edges + ggbump::geom_sigmoid(aes(linetype = I(line_type))) + + ggplot2::geom_label(data = dplyr::filter(df_gg, out_case == "unit_out"), + aes(x = (from_x + to_x) / 4, + y = from_y, + label = round(.data[[x_attrs$col_weights]], 2))) + + ggplot2::geom_label(data = dplyr::filter(df_gg, out_case == "frac_out"), + aes(x = (((from_x + to_x) / 2) + to_x) / 2, + y = to_y, + label = round(.data[[x_attrs$col_weights]], 2))) + + ## from nodes + ggplot2::geom_text(aes(x = from_x - 0.5, y = from_y, + label = .data[[x_attrs$col_from]], + fontface=I(font_type)), + ## drop idx groups to avoid duplicate labels + stat = "unique", inherit.aes = FALSE) + + ## to nodes + ggplot2::geom_label(aes(x = to_x + 0.5, y = to_y, + label = .data[[x_attrs$col_to]]), + fill = "black", + alpha = 0.1) + + ggplot2::scale_y_reverse() + + ggplot2::theme_minimal() + + theme(legend.position = "bottom", + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.text.y = element_blank(), + axis.text.x = element_blank(), + plot.background = element_rect(fill = "white")) + + labs(x = NULL, y = NULL) +} +``` + +```{r message=FALSE, echo=FALSE} +gg_bigraph <- anzsco_xmap |> + plt_xmap_bigraph() +``` + +```{r message=FALSE, echo=FALSE, out.width="100%"} +# print bigraph and code tables +gg_bigraph +``` + +```{r message=FALSE, echo=FALSE} +knitr::kable(list(table_anzsco, table_isco8)) +``` + +This visualisation also has benefits over the traditionally used Sankey diagram. Sankey diagrams are often used to illustrated "flows" between nodes. However, variable link widths can actually clutter the visualisation of crosswalks. Consider this simple crossmap that might be used to harmonise national accounts data (e.g. GDP) across two time periods. + +```{r} +edges <- tribble(~ctr, ~ctr2, ~split, + "BLX", "BEL", 0.5, + "BLX", "LUX", 0.5, + "E.GER", "DEU", 1, + "W.GER", "DEU", 1) +``` + +![](https://raw.githubusercontent.com/cynthiahqy/viz-panel-maps/57272c61692ece2d2b94874d12fe7f0619c6e864/docs/plots/viz-country-concord/ggsankey.png){width="100%"} + +On the other hand, the bigraph visualisation shows more clearly how data is modified (or not) when harmonising between nomenclature. The solid lines show when a link does not modify the source values, whilst the dotted line style indicates that data will be split up. Furthermore, by using fixed width links, there is room to place labels on-top of each curve indicated the transformation weights. + +![](https://raw.githubusercontent.com/cynthiahqy/viz-panel-maps/workflowr/docs/plots/ggbump-sigmoid-graph-edges.jpg){width="100%"} + +### Matrix + +Another useful visualisation or representation of a crossmap is as an incidence matrix with the source nomenclature indexed along the rows and the target nomenclature indexed on the columns: + +```{r xmap-as-matrix} +plt_xmap_ggmatrix <- function(x, ...){ + stopifnot(is_xmap_df(x)) + x_attrs <- attributes(x) + edges_complete <- tibble::as_tibble(x) |> + tidyr::complete(.data[[x_attrs$col_from]], .data[[x_attrs$col_to]]) + + ## add link-out type + gg_df <- edges_complete |> + dplyr::mutate(out_case = dplyr::case_when(.data[[x_attrs$col_weights]] == 1 ~ "one-to-one", + .data[[x_attrs$col_weights]] < 1 ~ "one-to-many", + is.na(.data[[x_attrs$col_weights]]) ~ "none") + ) + + ## make plot + gg_df |> ggplot(aes(x=.data[[x_attrs$col_to]], + y=.data[[x_attrs$col_from]])) + + geom_tile(aes(fill=out_case), col="grey") + + scale_y_discrete(limits=rev) + + scale_x_discrete(position='top') + + scale_fill_brewer() + + coord_fixed() + + labs(x = x_attrs$col_to, y = x_attrs$col_from, fill="Outgoing Link Type") + + theme_minimal() + + geom_text(data = dplyr::filter(gg_df, !is.na(.data[[x_attrs$col_weights]])), aes(label=round(.data[[x_attrs$col_weights]], 2))) + + theme(legend.position = "bottom", + panel.grid.major = element_blank(), + panel.grid.minor = element_blank() + ) +} +``` + +```{r} +plt_xmap_ggmatrix(anzsco_xmap) +``` + +Notice that the requirement that a valid crossmap has outgoing weights which sum to 1 for each source node is equivalent to a requirement that the total of weights across each row sums to 1. + +![](plot-weight-sum-matrix.png){width="100%"} + +## Visualising Types of Mapping Relations + +```{r, echo=FALSE} +veg_1a <- c("eggplant", "capsicum", "zucchini") +veg_1b <- c("aubergine", "pepper", "courgette") +veg_2 <- c("vegetables") +fruit_1a <- c("peach", "raspberry", "kumquat") +fruit_2 <- c("fruits") +pb_1a <- c("salt", "sugar", "peanuts") +pb_2 <- c("peanut butter") +pb_1a_w <- c(0.02, 0.05, 0.93) + +recode <- data.frame(au = veg_1a, uk = veg_1b, link = 1) |> + as_xmap_df(au, uk, link) +gg_recode <- recode |> plt_xmap_bigraph() + +agg <- data.frame(item = fruit_1a, group = fruit_2, link = 1) |> + as_xmap_df(item, group, link) +gg_agg <- agg |> plt_xmap_bigraph() + +disagg <- data.frame(group = pb_2, item = pb_1a, link = pb_1a_w) |> + as_xmap_df(group, item, link) +gg_disagg <- disagg |> plt_xmap_bigraph() +``` + +```{r} +library(patchwork) + +gg_recode + gg_agg + gg_disagg +``` diff --git a/vignettes/xmap.Rmd b/vignettes/xmap.Rmd new file mode 100644 index 0000000..b75044e --- /dev/null +++ b/vignettes/xmap.Rmd @@ -0,0 +1,135 @@ +--- +title: "Getting Started with Crossmaps" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Getting Started with Crossmaps} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Checking existing pipelines + +`xmap` offers verification functions for mappings encoded in named vectors or lists as well as data frames. Many of these checks will seem trivial in short mappings, but can be useful checks for longer or more complex mappings. + +For example, if you are using a named list to manually recode categorical variables (e.g. using `forcats::fct_recode`), then you might want to verify some properties of the level mappings -- e.g. each existing level is only mapped once: + +```{r} +library(forcats) +library(xmap) + +x <- factor(c("apple", "bear", "banana", "dear")) +levels <- ## new = "old" levels + c(fruit = "apple", fruit = "banana") |> + verify_named_all_values_unique() +forcats::fct_recode(x, !!!levels) +``` + +```{r error=TRUE} +mistake_levels <- c(fruit = "apple", fruit = "banana", veg = "banana") |> + verify_named_all_values_unique() +``` + +Similarly, if you are using a named list to encode group members, you could check the members against a reference list. + +```{r error=TRUE} +## mistakenly assign kate to another group +student_group_mistake <- list(GRP1 = c("kate", "jane", "peter"), + GRP2 = c("terry", "ben", "grace"), + GRP2 = c("cindy", "lucy", "kate" )) + +student_list <- c("kate", "jane", "peter", "terry", "ben", + "grace", "cindy", "lucy", "alex") + +student_group_mistake |> + verify_named_matchset_values_exact(student_list) +``` + +If you are redistributing numeric values between categories, see `vignette("making-xmaps")` for details on how to check your transformation weights redistribute exactly 100% of your original data. + +## Crossmap transformations + +All crossmaps transformations can be decomposed into multiple "standard" data manipulation steps. + +1. **Rename original categories into target categories** +2. **Mutate source node values by link weight.** +3. **Summarise mutated values by target node.** + +To implement these steps use the following `dplyr` pipeline with verified links: + +1. `dplyr::left_join` the target categories (`to`) to the original data via the source labels (`from`). +2. `dplyr::mutate` the source values by multiplying them with the link weights (`weights`) to transform the original values into redistributed values. +3. `dplyr::group_by` and `dplyr::summarise` values by target groups (`to`) to complete any many-to-1 mappings. + +For example given some original data (`v19_data`) with source categories (`v19`), and a valid `xmap` with source categories (`from = version19`), target categories (`to = version18`) and link weights (`weights = w19to18`): + +```{r message=FALSE} +library(xmap) +library(tibble) +library(dplyr) +``` + +```{r} +# original data +v19_data <- tibble::tribble( + ~v19, ~count, + "1120", 300, + "1121", 400, + "1130", 200, + "1200", 600 +) + +# valid crossmap +xmap_19to18 <- tibble::tribble( + ~version19, ~version18, ~w19to18, + # many-to-1 collapsing + "1120", "A2", 1, + "1121", "A2", 1, + # 1-to-1 recoding + "1130", "A3", 1, + # 1-to-many redistribution + "1200", "A4", 0.6, + "1200", "A5", 0.4 +) |> + verify_links_as_xmap(from = version19, to = version18, weights = w19to18) + +# transformed data +(v18_data <- dplyr::left_join(x = v19_data, + y = xmap_19to18, + by = c(v19 = "version19")) |> + dplyr::mutate(new_count = count * w19to18) |> + dplyr::group_by(version18) |> + dplyr::summarise(v20_count = sum(new_count)) +) +``` + +Note that we expect multiple matches for 1-to-many relations so the `dplyr` warning can safely be ignored. + +## Creating an `xmap_df` object (EXPERIMENTAL) + +The `xmap_df` class aims to facilitates additional functionality such as graph property calculation and printing (i.e. relation types and crossmap direction) via a custom `print()` method, coercion to other useful classes (e.g. `xmap_to_matrix()`), visualisation (see `vignette("vis-xmaps")` for prototypes), and multi-step transformations (i.e. from nomenclature A to B to C). + +Please note that the `xmap_df` class and related functions are still in active development and subject to breaking changes in future releases. + +If you already have a data frame of candidate links, turn them into a valid `xmap` object by specifying the source (`from`) nodes, target (`to`) nodes, and weights (`weights`): + +```{r} +uk_shares <- tibble::tribble( + ~key1, ~key2, ~shares, + "UK, Channel Islands, Isle of Man", "Scotland", 0.1102047, + "UK, Channel Islands, Isle of Man", "Wales", 0.02720333, + "UK, Channel Islands, Isle of Man", "England", 0.862592 + ) + +uk_shares |> + as_xmap_df(from = key1, to = key2, weights = shares, tol = 3e-08) |> + print() +``` + +Note that both verification and coercion will fail without adjusting the tolerance `tol`, which specifies differences to ignore (i.e. what counts as close enough to 1). + +```{r error=TRUE} +uk_shares |> + verify_links_as_xmap(key1, key2, shares) +```