diff --git a/DESCRIPTION b/DESCRIPTION index 62553ec..892704e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: actxps Title: Create Actuarial Experience Studies: Prepare Data, Summarize Results, and Create Reports -Version: 1.5.0 +Version: 1.5.0.9000 Authors@R: person("Matt", "Heaphy", email = "mattrmattrs@gmail.com", role = c("aut", "cre")) Maintainer: Matt Heaphy @@ -20,7 +20,7 @@ URL: https://github.com/mattheaphy/actxps/, BugReports: https://github.com/mattheaphy/actxps/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.0 +RoxygenNote: 7.3.2 Suggests: knitr, RColorBrewer, @@ -47,6 +47,7 @@ Imports: readr, tidyr, vctrs, - clock + clock, + cli LazyData: true VignetteBuilder: knitr diff --git a/NEWS.md b/NEWS.md index 2c8661e..3078bae 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,22 @@ +# actxps (development version) + +- New feature - control variables + + - `exp_stats()` has two new arguments: `control_vars` and + `control_distinct_max`. + - Control variables are used to estimate the impact of any grouping variables + on observed experience after accounting for the impact of control + variables. The idea here is that experience is first summarized across the + control variables to derive a new expected values basis. This expected + values basis is then used to calculate actual-to-expected ratios. + - The `control_distinct_max` argument places an upper limit on the + number of unique values allowed in any control variable. + - `exp_shiny()` now includes a control variable selection widget. + +- Added the "cli" package to replace "rlang" for certain errors, warnings, and + messages. +- Various small typo fixes. + # actxps 1.5.0 - `expose_split()` bug fixes: diff --git a/R/exp_df_helpers.R b/R/exp_df_helpers.R index 535bb38..0af0b73 100644 --- a/R/exp_df_helpers.R +++ b/R/exp_df_helpers.R @@ -149,7 +149,8 @@ as_exp_df <- function(x, expected = NULL, wt = NULL, wt = wt, credibility = credibility, conf_level = conf_level, cred_r = cred_r, - conf_int = conf_int) + conf_int = conf_int, + control_vars = NULL) } diff --git a/R/exp_shiny.R b/R/exp_shiny.R index d20f44f..1305348 100644 --- a/R/exp_shiny.R +++ b/R/exp_shiny.R @@ -49,11 +49,13 @@ #' #### Termination studies #' #' The expected values checkboxes are used to activate and deactivate expected -#' values passed to the `expected` argument. This impacts the table output -#' directly and the available "y" variables for the plot. If there are no -#' expected values available, this widget will not appear. The "Weight by" -#' widget is used to specify which column, if any, contains weights for -#' summarizing experience. +#' values passed to the `expected` argument. These checkboxes also include +#' a "control" item for expected values derived using control variables. +#' These boxes impact the table output directly and the available "y" variables +#' for the plot. The "Weight by" widget is used to specify which column, if any, +#' contains weights for summarizing experience. The "Control variables" widget +#' is used to specify which columns, if any, are used as control variables +#' (see [exp_stats()] for more information). #' #' #### Transaction studies #' @@ -265,8 +267,8 @@ exp_shiny <- function(dat, choices <- info$scope[[1]] if (is.null(dat[[x]])) { - rlang::abort( - glue::glue("Error creating an input widget for {x}. {x} does not exist in the input data.") + cli::cli_abort( + "Error creating an input widget for {x}. {x} does not exist in the input data." 
) } @@ -309,8 +311,8 @@ exp_shiny <- function(dat, } else { - rlang::abort( - glue::glue("Error creating an input widget for {x}. {x} is of class {class(dat[[x]]) |> paste(collapse = ', ')}, which is not supported.") + cli::cli_abort( + "Error creating an input widget for {x}. {x} is of class {class(dat[[x]])}, which is not supported." ) } @@ -355,18 +357,10 @@ exp_shiny <- function(dat, } # expected values set up - if (length(expected) > 0) { - - has_expected <- TRUE - - expected_widget <- checkboxGroupPred("ex_checks", "Expected values:", 6, - choices = expected, - selected = expected) - - } else { - has_expected <- FALSE - expected_widget <- NULL - } + expected <- c(expected, "control") + expected_widget <- checkboxGroupPred( + "ex_checks", "Expected values:", 4, + choices = expected, selected = expected[expected != "control"]) # transactions set up if (has_trx) { @@ -506,9 +500,12 @@ exp_shiny <- function(dat, value = "exp", shiny::fluidRow( expected_widget, - selectPred("weightVar", "Weight by:", 6, + selectPred("weightVar", "Weight by:", 4, choices = c("None", - filter(preds, is_number)$predictors)) + filter(preds, is_number)$predictors)), + selectPred("controlVars", "Control variables:", 4, + multiple = TRUE, + choices = preds_small) )), trx_tab @@ -528,16 +525,16 @@ exp_shiny <- function(dat, shiny::markdown( '
- - `y`-axis variable selection - - `Second y-axis` toggle and variable - - `Geometry` for plotting - - `Add smoothing`: add smooth loess curves - - `Confidence intervals`: If available, draw confidence interval - error bars - - `Free y-scales`: enable separate `y` scales in each subplot - - `Log y-axis`: plot y-axes on a log-10 scale - - The grouping variables selected above will determine the - variable on the `x`-axis, the color variable, and faceting + - **y-axis** variable selection + - **Second y-axis** toggle and variable + - **Geometry** for plotting + - **Add smoothing**: add smooth loess curves + - **Confidence intervals**: If available, draw confidence + interval error bars + - **Free y-scales**: enable separate y scales in each subplot + - **Log y-axis**: plot y-axes on a log-10 scale + - The **grouping variables** selected above will determine the + variable on the x-axis, the color variable, and faceting variables used to create subplots.
'), @@ -806,8 +803,9 @@ exp_shiny <- function(dat, } } - ex <- if (has_expected) { - input$ex_checks + ex <- input$ex_checks[input$ex_checks != "control"] + ctrl <- if ("control" %in% input$ex_checks) { + input$controlVars %||% ".none" } if (input$study_type == "exp") { @@ -815,7 +813,7 @@ group_by(dplyr::across(dplyr::all_of(.groups))) |> exp_stats(wt = wt, credibility = credibility, expected = ex, conf_level = conf_level, cred_r = cred_r, - conf_int = TRUE) + conf_int = TRUE, control_vars = ctrl) } else { rdat() |> group_by(dplyr::across(dplyr::all_of(.groups))) |> diff --git a/R/exp_stats.R b/R/exp_stats.R index 472e2be..a75e01c 100644 --- a/R/exp_stats.R +++ b/R/exp_stats.R @@ -14,9 +14,39 @@ #' # Expected values #' #' The `expected` argument is optional. If provided, this argument must -#' be a character vector with values corresponding to columns in `.data` +#' be a character vector with values corresponding to column names in `.data` #' containing expected experience. More than one expected basis can be provided. #' +#' # Control variables +#' +#' The `control_vars` argument is optional. If provided, this argument must +#' be `".none"` (more on this below) or a character vector with values +#' corresponding to column names in `.data`. Control variables are used to +#' estimate the impact of any grouping variables on observed experience +#' *after accounting for* the impact of control variables. +#' +#' Mechanically, when values are passed to `control_vars`, a separate call +#' is made to [exp_stats()] using the control variables as grouping variables. +#' This is used to derive a new expected values basis called `control`, which is +#' both added to `.data` and appended to the `expected` argument. In the final +#' output, a column called `ae_control` shows the relative impact of any +#' grouping variables after accounting for the control variables. +#' +#' **About `".none"`**: If `".none"` is passed to `control_vars`, a single +#' aggregate termination rate is calculated for the entire data set and used to +#' compute `control` and `ae_control`. +#' +#' The `control_distinct_max` argument places an upper limit on the number of +#' unique values that a control variable is allowed to have. This limit exists +#' to prevent an excessive number of groups on continuous or high-cardinality +#' features. +#' +#' It should be noted that usage of control variables is a rough approximation +#' and not a substitute for rigorous statistical models. The impact of control +#' variables is calculated in isolation and does not consider other features or +#' possible confounding variables. As such, control variables are most useful +#' for exploratory data analysis. +#' #' # Credibility #' #' If `credibility` is set to `TRUE`, the output will contain a @@ -69,6 +99,10 @@ #' method #' @param conf_int If `TRUE`, the output will include confidence intervals #' around the observed termination rates and any actual-to-expected ratios. +#' @param control_vars `".none"` or a character vector containing column names +#' in `.data` to use as control variables +#' @param control_distinct_max Maximum number of unique values allowed for +#' control variables #' @param object An `exp_df` object #' @param ... Groups to retain after `summary()` is called #' @@ -76,8 +110,9 @@ #' and `data.frame`. The results include columns for any grouping variables, #' claims, exposures, and observed termination rates (`q_obs`). 
#' -#' - If any values are passed to `expected`, expected termination rates and -#' actual-to-expected ratios. +#' - If any values are passed to `expected` or `control_vars`, additional +#' columns are added for expected termination rates and actual-to-expected +#' (A/E) ratios. A/E ratios are prefixed by `ae_`. #' - If `credibility` is set to `TRUE`, additional columns are added #' for partial credibility and credibility-weighted termination rates #' (assuming values are passed to `expected`). Credibility-weighted termination @@ -100,7 +135,7 @@ #' exp_res <- census_dat |> #' expose("2019-12-31", target_status = "Surrender") |> #' group_by(pol_yr, inc_guar) |> -#' exp_stats() +#' exp_stats(control_vars = "product") #' #' exp_res #' summary(exp_res) @@ -115,7 +150,9 @@ exp_stats <- function(.data, target_status = attr(.data, "target_status"), wt = NULL, credibility = FALSE, conf_level = 0.95, cred_r = 0.05, - conf_int = FALSE) { + conf_int = FALSE, + control_vars, + control_distinct_max = 25L) { .groups <- groups(.data) start_date <- attr(.data, "start_date") @@ -123,12 +160,12 @@ if (is.null(target_status)) { target_status <- levels(.data$status)[-1] - rlang::warn(c(x = "No target status was provided.", - i = glue::glue("{paste(target_status, collapse = ', ')} was assumed."))) + cli::cli_warn(c(x = "No target status was provided.", + i = "{.val {target_status}} {?was/were} assumed.")) } if (length(wt) > 1) { - rlang::abort(c(x = glue::glue("Only 1 column can be passed to `wt`. You supplied {length(wt)} values."))) + cli::cli_abort(c(x = "Only 1 column can be passed to `wt`. You supplied {length(wt)} values.")) } check_split_expose_basis(.data, col_exposure) @@ -151,31 +188,95 @@ res$claims <- res$n_claims } + if (missing(expected) || is.null(expected)) { + expected <- NULL + } else { + verify_col_exist(names(res), expected, "expected values column") + } + + if (!missing(control_vars) && !is.null(control_vars)) { + + # special handling for aggregates + if (all(control_vars == ".none")) { + if (".none" %in% names(res)) { + cli::cli_abort( + "Name conflict error: {.val .none} cannot be passed to `control_vars` + because there is also a column in the data with the same name." + ) + } + control_vars <- "None" + res[["None"]] <- 1L + } else { + if (".none" %in% control_vars) { + cli::cli_abort( + "If {.val .none} is passed to `control_vars`, then no other values + are allowed." + ) + } + verify_col_exist(names(res), control_vars, "control variable") + } + + # throw an error if too many unique values + nd_ctrl <- res |> + ungroup() |> + dplyr::summarize(dplyr::across(dplyr::all_of(control_vars), dplyr::n_distinct)) + nd_ctrl <- colnames(nd_ctrl)[nd_ctrl > control_distinct_max] + if (length(nd_ctrl) > 0) { + cli::cli_abort(c( + x = "There are too many distinct values in the {.val {nd_ctrl}} control variable{?s}.", + i = paste0( + "Limit = {control_distinct_max}. 
Update the `control_distinct_max` ", + "argument to increase the limit, or consider techniques like ", + "binning to reduce cardinality") + )) + } + + # calculate observed rates across control variables + ctrl_dat <- finish_exp_stats( + res |> group_by(dplyr::across(dplyr::all_of(control_vars))), + target_status, expected = NULL, .groups = control_vars, + start_date = start_date, end_date = end_date, + credibility = FALSE, conf_level = conf_level, + cred_r = cred_r, wt = wt, conf_int = FALSE, control_vars = NULL + ) |> + select(dplyr::all_of(control_vars), control = q_obs) + + # join observed rates on control variables to the data and add these + # to expected values + res <- left_join(res, ctrl_dat, by = control_vars) + expected <- c(expected, "control") + + } else { + control_vars <- NULL + } + finish_exp_stats(res, target_status, expected, .groups, start_date, end_date, credibility, - conf_level, cred_r, wt, conf_int) + conf_level, cred_r, wt, conf_int, control_vars) } #' @export print.exp_df <- function(x, ...) { - cat("Experience study results\n\n") + cli::cli_h2("Experience study results") if (length(groups(x)) > 0) { - cat(" Groups:", paste(groups(x), collapse = ", "), "\n") + cli::cli_ul("{.field Groups}: {groups(x)}") + } + cli::cli_ul(c( + "{.field Target status}: {attr(x, 'target_status')}", + "{.field Study range}: {attr(x, 'start_date')} to {attr(x, 'end_date')}")) + if (!is.null(attr(x, "control_vars"))) { + cli::cli_ul("{.field Control variables}: {attr(x, 'control_vars')}") } - cat(" Target status:", paste(attr(x, "target_status"), collapse = ", "), "\n", - "Study range:", as.character(attr(x, "start_date")), "to", - as.character(attr(x, "end_date")), "\n") if (!is.null(attr(x, "expected"))) { - cat(" Expected values:", paste(attr(x, "expected"), collapse = ", "), "\n") + cli::cli_ul("{.field Expected values}: {attr(x, 'expected')}") } - if (is.null(attr(x, "wt"))) { - cat("\n") - } else { - cat(" Weighted by:", attr(x, "wt"), "\n\n") + if (!is.null(attr(x, "wt"))) { + cli::cli_ul("{.field Weighted by}: {attr(x, 'wt')}\n") } + cat("\n") NextMethod() } @@ -198,11 +299,12 @@ summary.exp_df <- function(object, ...) { expected <- attr(object, "expected") xp_params <- attr(object, "xp_params") wt <- attr(object, "wt") + control_vars <- attr(object, "control_vars") finish_exp_stats(res, target_status, expected, .groups, start_date, end_date, xp_params$credibility, xp_params$conf_level, xp_params$cred_r, - wt, xp_params$conf_int) + wt, xp_params$conf_int, control_vars) } @@ -213,16 +315,16 @@ summary.exp_df <- function(object, ...) { finish_exp_stats <- function(.data, target_status, expected, .groups, start_date, end_date, credibility, conf_level, cred_r, - wt, conf_int) { + wt, conf_int, control_vars) { # expected value formulas. 
these are already weighted if applicable - if (!missing(expected)) { + if (!is.null(expected)) { ex_mean <- exp_form("weighted.mean({.col}, exposure)", "{.col}", expected) ex_ae <- exp_form("q_obs / {.col}", "ae_{.col}", expected) } else { - ex_ae <- ex_mean <- expected <- NULL + ex_ae <- ex_mean <- NULL } # additional columns for weighted studies @@ -341,13 +443,14 @@ finish_exp_stats <- function(.data, target_status, expected, wt = wt, credibility = credibility, conf_level = conf_level, cred_r = cred_r, - conf_int = conf_int) + conf_int = conf_int, + control_vars = control_vars) } # low level class constructor new_exp_df <- function(x, .groups, target_status, start_date, expected, end_date, wt, credibility, conf_level, - cred_r = cred_r, conf_int) { + cred_r = cred_r, conf_int, control_vars) { tibble::new_tibble(x, class = "exp_df", groups = .groups, @@ -359,7 +462,8 @@ new_exp_df <- function(x, .groups, target_status, start_date, expected, xp_params = list(credibility = credibility, conf_level = conf_level, cred_r = cred_r, - conf_int = conf_int)) + conf_int = conf_int), + control_vars = control_vars) } # this function is used to create formula specifications passed to dplyr::mutate @@ -382,8 +486,8 @@ exp_form <- function(form, new_col, .col) { verify_exp_df <- function(.data) { if (!inherits(.data, "exp_df")) { - rlang::abort(c(x = glue::glue("`{deparse(substitute(.data))}` must be an `exp_df` object."), - i = "Hint: Use `exp_stats()` to create `exp_df` objects." + cli::cli_abort(c(x = "`{deparse(substitute(.data))}` must be an `exp_df` object.", + i = "Hint: Use `exp_stats()` to create `exp_df` objects." )) } } diff --git a/R/expose.R b/R/expose.R index 030def0..08f3dbf 100644 --- a/R/expose.R +++ b/R/expose.R @@ -370,8 +370,8 @@ add_period <- function(expo_length) { x <- x[x %in% names(.data)] .data[x] <- NULL if (length(x > 0)) { - rlang::warn(c(x = glue::glue("`.data` contains the following conflicting columns that will be overridden: {paste(x, collapse = ', ')}."), - i = "If you don't want this to happen, rename these columns prior to calling the applicable expose function.")) + cli::cli_warn(c(x = "`.data` contains the following conflicting columns that will be overridden: {.val {x}}.", + i = "If you don't want this to happen, rename these columns prior to calling the applicable expose function.")) } .data } @@ -402,8 +402,8 @@ is.Date <- function(x) { .check_missing_dates <- function(x, name) { if (any(is.na(x))) { - rlang::abort(c( - glue::glue("Missing values are not allowed in the `{name}` column."), + cli::cli_abort(c( + "Missing values are not allowed in the `{name}` column.", i = "Make sure all dates are in YYYY-MM-DD format.") ) } diff --git a/R/exposed_df_helpers.R b/R/exposed_df_helpers.R index 330304f..a98a771 100644 --- a/R/exposed_df_helpers.R +++ b/R/exposed_df_helpers.R @@ -174,18 +174,19 @@ new_exposed_df <- function(x, end_date, start_date, target_status, #' @export print.exposed_df <- function(x, ...) 
{ - cat("Exposure data\n\n", - "Exposure type:", attr(x, "exposure_type"), "\n", - "Target status:", paste(attr(x, "target_status"), collapse = ", "), "\n", - "Study range:", as.character(attr(x, "start_date")), "to", - as.character(attr(x, "end_date"))) + cli::cli_h2("Exposure data") + cli::cli_ul(c( + "{.field Exposure type}: {attr(x, 'exposure_type')}", + "{.field Target status}: {attr(x, 'target_status')}", + "{.field Study range}: {attr(x, 'start_date')} to {attr(x, 'end_date')}" + )) trx_types <- attr(x, "trx_types") if (!is.null(trx_types)) { - cat("\n", "Transaction types:", paste(trx_types, collapse = ", "), "\n") + cli::cli_ul("{.field Transaction types}: {trx_types}") } - cat("\n\n") + cat("\n") NextMethod() } @@ -589,8 +590,8 @@ make_date_col_names <- function(cal_expo, expo_length) { verify_exposed_df <- function(.data) { if (!is_exposed_df(.data)) { - rlang::abort(c(x = glue::glue("`{deparse(substitute(.data))}` must be an `exposed_df` object."), - i = "Hint: Use `as_exposed_df()` to convert your data to the required format." + cli::cli_abort(c(x = "`{deparse(substitute(.data))}` must be an `exposed_df` object.", + i = "Hint: Use `as_exposed_df()` to convert your data to the required format." )) } } @@ -602,8 +603,8 @@ verify_get_trx_types <- function(.data, required = TRUE) { trx_types <- attr(.data, "trx_types") if (is.null(trx_types)) { if (required) { - rlang::abort(c(x = "No transactions have been attached to `.data`.", - i = "Add transaction data using `add_transactions()` before calling this function.")) + cli::cli_abort(c(x = "No transactions have been attached to `.data`.", + i = "Add transaction data using `add_transactions()` before calling this function.")) } return(NULL) } @@ -615,7 +616,17 @@ verify_col_names <- function(x_names, required) { unmatched <- setdiff(required, x_names) if (length(unmatched) > 0) { - rlang::abort(c(x = glue::glue("The following columns are missing: {paste(unmatched, collapse = ', ')}."), - i = "Hint: create these columns or use the `col_*` arguments to specify existing columns that should be mapped to these elements.")) + cli::cli_abort(c(x = "The following column{?s} {?is/are} missing: {.val {unmatched}}.", + i = "Hint: create these columns or use the `col_*` arguments to specify existing columns that should be mapped to these elements.")) + } +} + +# similar to the above, but with a different context in the error message +verify_col_exist <- function(x_names, required, what = "column") { + unmatched <- setdiff(required, x_names) + n <- length(unmatched) + + if (length(unmatched) > 0) { + cli::cli_abort(c(x = "The following {what}{cli::qty(n)}{?s} {?was/were} not found in the data: {.val {unmatched}}.")) } } diff --git a/R/transactions.R b/R/transactions.R index 6c8d60f..811098b 100644 --- a/R/transactions.R +++ b/R/transactions.R @@ -76,8 +76,8 @@ add_transactions <- function(.data, trx_data, new_trx_types <- unique(trx_data$trx_type) conflict_trx_types <- intersect(new_trx_types, existing_trx_types) if (length(conflict_trx_types) > 0) { - rlang::abort(c(x = glue::glue("`trx_data` contains transaction types that have already been attached to `.data`: {paste(conflict_trx_types, collapse = ', ')}."), - i = "Update `trx_data` with unique transaction types.")) + cli::cli_abort(c(x = "`trx_data` contains transaction types that have already been attached to `.data`: {.val {conflict_trx_types}}.", + i = "Update `trx_data` with unique transaction types.")) } # add dates to transaction data diff --git a/R/trx_stats.R b/R/trx_stats.R index 
a3cd566..96fecd5 100644 --- a/R/trx_stats.R +++ b/R/trx_stats.R @@ -113,7 +113,7 @@ #' - `avg_trx`: mean transaction amount (`trx_amt / trx_flag`) #' - `avg_all`: mean transaction amount over all records (`trx_amt / exposure`) #' - `trx_freq`: transaction frequency when a transaction occurs (`trx_n / trx_flag`) -#' - `trx_utilization`: transaction utilization per observation period (`trx_flag / exposure`) +#' - `trx_util`: transaction utilization per observation period (`trx_flag / exposure`) #' #' If `percent_of` is provided, the results will also include: #' @@ -166,7 +166,7 @@ trx_stats <- function(.data, } else { unmatched <- setdiff(trx_types, all_trx_types) if (length(unmatched) > 0) { - rlang::abort(c(x = glue::glue("The following transactions do not exist in `.data`: {paste0(unmatched, collapse = ', ')}"))) + cli::cli_abort(c(x = "The following transactions do not exist in `.data`: {.val {unmatched}}")) } } @@ -218,17 +218,18 @@ #' @export print.trx_df <- function(x, ...) { - cat("Transaction study results\n\n") + cli::cli_h2("Transaction study results") if (length(groups(x)) > 0) { - cat(" Groups:", paste(groups(x), collapse = ", "), "\n") + cli::cli_ul("{.field Groups}: {groups(x)}") } - cat(" Study range:", as.character(attr(x, "start_date")), "to", - as.character(attr(x, "end_date")), "\n", - "Transaction types:", paste(attr(x, "trx_types"), collapse = ", "), "\n") + cli::cli_ul(c( + "{.field Study range}: {attr(x, 'start_date')} to {attr(x, 'end_date')}", + "{.field Transaction types}: {attr(x, 'trx_types')}")) if (!is.null(attr(x, "percent_of"))) { - cat(" Transactions as % of:", paste(attr(x, "percent_of"), collapse = ", "), "\n") + cli::cli_ul("{.field Transactions as % of}: {attr(x, 'percent_of')}") } + cat("\n") NextMethod() } @@ -378,8 +379,8 @@ new_trx_df <- function(x, .groups, trx_types, verify_trx_df <- function(.data) { if (!inherits(.data, "trx_df")) { - rlang::abort(c(x = glue::glue("`{deparse(substitute(.data))}` must be a `trx_df` object."), - i = "Hint: Use `trx_stats()` to create `trx_df` objects." + cli::cli_abort(c(x = "`{deparse(substitute(.data))}` must be a `trx_df` object.", + i = "Hint: Use `trx_stats()` to create `trx_df` objects." )) } } diff --git a/man/exp_shiny.Rd b/man/exp_shiny.Rd index 137f60f..b749ce4 100644 --- a/man/exp_shiny.Rd +++ b/man/exp_shiny.Rd @@ -105,11 +105,13 @@ study type. \subsection{Termination studies}{ The expected values checkboxes are used to activate and deactivate expected -values passed to the \code{expected} argument. This impacts the table output -directly and the available "y" variables for the plot. If there are no -expected values available, this widget will not appear. The "Weight by" -widget is used to specify which column, if any, contains weights for -summarizing experience. +values passed to the \code{expected} argument. These checkboxes also include +a "control" item for expected values derived using control variables. +These boxes impact the table output directly and the available "y" variables +for the plot. The "Weight by" widget is used to specify which column, if any, +contains weights for summarizing experience. The "Control variables" widget +is used to specify which columns, if any, are used as control variables +(see \code{\link[=exp_stats]{exp_stats()}} for more information). 
} \subsection{Transaction studies}{ diff --git a/man/exp_stats.Rd b/man/exp_stats.Rd index 184ea05..1c3f13c 100644 --- a/man/exp_stats.Rd +++ b/man/exp_stats.Rd @@ -15,7 +15,9 @@ exp_stats( credibility = FALSE, conf_level = 0.95, cred_r = 0.05, - conf_int = FALSE + conf_int = FALSE, + control_vars, + control_distinct_max = 25L ) \method{summary}{exp_df}(object, ...) @@ -49,6 +51,12 @@ method} \item{conf_int}{If \code{TRUE}, the output will include confidence intervals around the observed termination rates and any actual-to-expected ratios.} +\item{control_vars}{\code{".none"} or a character vector containing column names +in \code{.data} to use as control variables} + +\item{control_distinct_max}{Maximum number of unique values allowed for +control variables} + \item{object}{An \code{exp_df} object} \item{...}{Groups to retain after \code{summary()} is called} @@ -58,8 +66,9 @@ A tibble with class \code{exp_df}, \code{tbl_df}, \code{tbl}, and \code{data.frame}. The results include columns for any grouping variables, claims, exposures, and observed termination rates (\code{q_obs}). \itemize{ -\item If any values are passed to \code{expected}, expected termination rates and -actual-to-expected ratios. +\item If any values are passed to \code{expected} or \code{control_vars}, additional +columns are added for expected termination rates and actual-to-expected +(A/E) ratios. A/E ratios are prefixed by \code{ae_}. \item If \code{credibility} is set to \code{TRUE}, additional columns are added for partial credibility and credibility-weighted termination rates (assuming values are passed to \code{expected}). Credibility-weighted termination @@ -91,10 +100,40 @@ This will produce a warning message. } \section{Expected values}{ The \code{expected} argument is optional. If provided, this argument must -be a character vector with values corresponding to columns in \code{.data} +be a character vector with values corresponding to column names in \code{.data} containing expected experience. More than one expected basis can be provided. } +\section{Control variables}{ +The \code{control_vars} argument is optional. If provided, this argument must +be \code{".none"} (more on this below) or a character vector with values +corresponding to column names in \code{.data}. Control variables are used to +estimate the impact of any grouping variables on observed experience +\emph{after accounting for} the impact of control variables. + +Mechanically, when values are passed to \code{control_vars}, a separate call +is made to \code{\link[=exp_stats]{exp_stats()}} using the control variables as grouping variables. +This is used to derive a new expected values basis called \code{control}, which is +both added to \code{.data} and appended to the \code{expected} argument. In the final +output, a column called \code{ae_control} shows the relative impact of any +grouping variables after accounting for the control variables. + +\strong{About \code{".none"}}: If \code{".none"} is passed to \code{control_vars}, a single +aggregate termination rate is calculated for the entire data set and used to +compute \code{control} and \code{ae_control}. + +The \code{control_distinct_max} argument places an upper limit on the number of +unique values that a control variable is allowed to have. This limit exists +to prevent an excessive number of groups on continuous or high-cardinality +features. + +It should be noted that usage of control variables is a rough approximation +and not a substitute for rigorous statistical models. 
The impact of control +variables is calculated in isolation and does not consider other features or +possible confounding variables. As such, control variables are most useful +for exploratory data analysis. +} + \section{Credibility}{ If \code{credibility} is set to \code{TRUE}, the output will contain a \code{credibility} column equal to the partial credibility estimate under @@ -136,7 +175,7 @@ toy_census |> expose("2022-12-31", target_status = "Surrender") |> exp_res <- census_dat |> expose("2019-12-31", target_status = "Surrender") |> group_by(pol_yr, inc_guar) |> - exp_stats() + exp_stats(control_vars = "product") exp_res summary(exp_res) diff --git a/man/trx_stats.Rd b/man/trx_stats.Rd index e3dfc45..aaab1bf 100644 --- a/man/trx_stats.Rd +++ b/man/trx_stats.Rd @@ -62,7 +62,7 @@ variables and transaction types, plus the following: \item \code{avg_trx}: mean transaction amount (\code{trx_amt / trx_flag}) \item \code{avg_all}: mean transaction amount over all records (\code{trx_amt / exposure}) \item \code{trx_freq}: transaction frequency when a transaction occurs (\code{trx_n / trx_flag}) -\item \code{trx_utilization}: transaction utilization per observation period (\code{trx_flag / exposure}) +\item \code{trx_util}: transaction utilization per observation period (\code{trx_flag / exposure}) } If \code{percent_of} is provided, the results will also include: diff --git a/tests/testthat/test-add_predictions.R b/tests/testthat/test-add_predictions.R index b3744a9..c34a316 100644 --- a/tests/testthat/test-add_predictions.R +++ b/tests/testthat/test-add_predictions.R @@ -20,8 +20,10 @@ test_that("add_predictions works with exposed_df and data frames", { }) test_that("add_predictions works with matrix output", { - expect_s3_class(add_predictions(expo, mod, type = 'terms'), - 'exposed_df') + suppressMessages( + expect_s3_class(add_predictions(expo, mod, type = 'terms'), + 'exposed_df') + ) }) test_that("add_predictions col_expected works", { diff --git a/tests/testthat/test-exp_df_helpers.R b/tests/testthat/test-exp_df_helpers.R index 2d980da..6091495 100644 --- a/tests/testthat/test-exp_df_helpers.R +++ b/tests/testthat/test-exp_df_helpers.R @@ -26,7 +26,7 @@ test_that("as_exp_df works", { expect_true(is_exp_df(res3)) - expect_error(as_exp_df(res4), regexp = "The following columns are missing") + expect_error(as_exp_df(res4), regexp = "The following column is missing") expect_no_error(as_exp_df(res4, col_exposure = "expo")) expect_no_error(as_exp_df(res5, col_exposure = "expo", col_claims = "clms")) diff --git a/tests/testthat/test-exp_stats.R b/tests/testthat/test-exp_stats.R index 7e6956b..7b5149d 100644 --- a/tests/testthat/test-exp_stats.R +++ b/tests/testthat/test-exp_stats.R @@ -59,3 +59,42 @@ test_that("Confidence intervals work", { less_confident$q_obs_upper - less_confident$q_obs_lower)) }) + +test_that("Control variables work", { + + exp_ctrl1 <- study_py |> + group_by(pol_yr, inc_guar) |> + exp_stats(control_vars = c("pol_yr", "inc_guar")) + + # when control variables and grouping variables overlap, A/E's should be one + expect_equal(exp_ctrl1$ae_control, rep(1, nrow(exp_ctrl1))) + expect_equal(exp_stats(study_py, control_vars = ".none")$ae_control, 1) + + # manually reproduce control var results + exp_prod_qual <- study_py |> + group_by(product, qual) |> + exp_stats(wt = 'premium') |> + select(product, qual, q_prod_qual = q_obs) + exp_ctrl2 <- study_py |> + left_join(exp_prod_qual, by = c("product", "qual")) |> + group_by(pol_yr, inc_guar) |> + exp_stats(control_vars = c("product", 
"qual"), + expected = "q_prod_qual", wt = 'premium') + expect_equal(exp_ctrl2$ae_control, exp_ctrl2$ae_q_prod_qual) + + +}) + +test_that("Error messages work", { + + expect_error(study_py |> exp_stats(expected = 'banana'), + regexp = 'expected values column was not found') + expect_error(study_py |> exp_stats(control_vars = c('banana', 'apple')), + regexp = 'control variables were not found') + expect_error(study_py |> mutate(.none = 1) |> + exp_stats(control_vars = ".none"), + regexp = 'Name conflict error') + expect_error(study_py |> rename(ex = exposure) |> exp_stats(), + regexp = "Can't rename columns") + +}) diff --git a/tests/testthat/test-trx_df_helpers.R b/tests/testthat/test-trx_df_helpers.R index 6c9fd2d..2dc3e97 100644 --- a/tests/testthat/test-trx_df_helpers.R +++ b/tests/testthat/test-trx_df_helpers.R @@ -29,7 +29,7 @@ test_that("as_trx_df works", { expect_true(is_trx_df(res3)) - expect_error(as_trx_df(res4), regexp = "The following columns are missing") + expect_error(as_trx_df(res4), regexp = "The following column is missing") expect_no_error(as_trx_df(res4, col_exposure = "expo")) expect_no_error(as_trx_df(res5, col_exposure = "expo", col_trx_amt = "tamt", col_trx_n = "tn")) diff --git a/vignettes/articles/visualizations.Rmd b/vignettes/articles/visualizations.Rmd index e288da5..167053a 100644 --- a/vignettes/articles/visualizations.Rmd +++ b/vignettes/articles/visualizations.Rmd @@ -273,7 +273,7 @@ The "Grouping variables" box includes widgets to select grouping variables for s The "Study type" box will always include a tab for termination studies. If transactions are attached to the `exposed_df` object^[See `add_transactions()` for information on attaching transactions to an `exposed_df` object.], an additional section will be displayed for transaction studies. -- Termination study options include the ability to activate and deactivate expected values and the selection of an optional numeric weighting variable for claims and exposures. Available expected value choices are dictated by the `expected` argument. If this argument is not specified, any columns containing the word "expected" are assumed to be expected values. +- Termination study options include the ability to activate and deactivate expected values, the selection of an optional numeric weighting variable for claims and exposures, and the selection of control variables. Available expected value choices are dictated by the `expected` argument. If this argument is not specified, any columns containing the word "expected" are assumed to be expected values. - Transaction study options include the ability to activate and deactivate transaction types, optional numeric columns to use in "percentage of" statistics, and an option to lump all transaction types into a single category. The output section includes tabs for plots, tables, and exporting results. diff --git a/vignettes/exp_summary.Rmd b/vignettes/exp_summary.Rmd index eca961c..474a5e3 100644 --- a/vignettes/exp_summary.Rmd +++ b/vignettes/exp_summary.Rmd @@ -127,7 +127,7 @@ exposed_data |> ## Expected values and A/E ratios -As common metric in experience studies is the actual-to-expected, or A/E ratio. +A common metric in experience studies is the actual-to-expected, or A/E ratio. $$ A/E\ ratio = \frac{observed\ value}{expected\ value} @@ -171,6 +171,48 @@ exposed_data |> ``` +### Control variables + +Control variables are a related concept to expected values. 
Control variables are used to estimate the impact of any grouping variables on observed experience *after accounting for* the impact of other (control) variables. + +Control variables can help answer questions like, "After accounting for policy year, how much lower are surrender rates for contracts with a guaranteed income rider relative to contracts without a rider?" Here, the presence of a guaranteed income rider is a grouping variable and policy year is a control variable. + +Control variables are specified using the optional `control_vars` argument. If provided, this argument must be `".none"` (more on this below) or a character vector with values corresponding to column names in `.data`. + +To answer the question above, we can group the data by `inc_guar` and add `control_vars = "pol_yr"` in a call to `exp_stats()`. + +```{r act-exp-ctrl} +exposed_data |> + group_by(inc_guar) |> + exp_stats(control_vars = "pol_yr") |> + select(inc_guar, q_obs, control, ae_control) +``` + +In the resulting output, two new columns appear: + +- `control`: Observed surrender rates considering the control variables (`pol_yr`) only. The two values of `control` above do not match simply because the distributions of `pol_yr` across the levels of `inc_guar` are not identical. +- `ae_control`: The A/E ratio of observed experience versus `control`. This is an estimate of the impact of `inc_guar` after accounting for `pol_yr` effects. + +These results show that the presence of a guaranteed income rider substantially decreases surrender rates. The converse is true for contracts without a rider. + +As an alternative, if `".none"` is passed to `control_vars`, a single aggregate termination rate is calculated for the entire data set and used to compute `control` and `ae_control`. + +```{r act-exp-ctrl2} +exposed_data |> + group_by(inc_guar) |> + exp_stats(control_vars = ".none") |> + select(inc_guar, q_obs, control, ae_control) +``` + +Note that: + +- `control` is now a constant value +- `ae_control` yields different results + +The `control_distinct_max` argument places an upper limit on the number of unique values that a control variable is allowed to have. This limit exists to prevent an excessive number of groups on continuous or high-cardinality features. + +It should be noted that usage of control variables is a rough approximation and not a substitute for rigorous statistical models. The impact of control variables is calculated in isolation and does not consider other features or possible confounding variables. As such, control variables are most useful for exploratory data analysis. + ## Credibility If the `credibility` argument is set to `TRUE`, `exp_stats()` will produce an estimate of partial credibility under the Limited Fluctuation credibility method (also known as Classical Credibility) assuming a binomial distribution of claims.^[See Herzog, Thomas (1999). Introduction to Credibility Theory for more information on Limited Fluctuation Credibility.]
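For reviewers, here is a minimal sketch of the equivalence the new `control_vars` logic relies on and that the added `test-exp_stats.R` case exercises: the `control` basis is just observed experience summarized across the control variables, joined back to the data, and treated as another expected basis. The snippet assumes the `census_dat` / `expose()` workflow from the package examples; the object names `ctrl_res`, `prod_rates`, `q_prod`, and `manual_res` are illustrative only.

```r
library(actxps)
library(dplyr)

# Exposure records, as in the exp_stats() examples
exposed_data <- census_dat |>
  expose("2019-12-31", target_status = "Surrender")

# New interface: let exp_stats() build a "control" basis from `product`
ctrl_res <- exposed_data |>
  group_by(pol_yr, inc_guar) |>
  exp_stats(control_vars = "product")

# Manual equivalent (illustrative names): summarize experience across the
# control variable to derive an expected basis, join it back to the data,
# and pass it to `expected`
prod_rates <- exposed_data |>
  group_by(product) |>
  exp_stats() |>
  select(product, q_prod = q_obs)

manual_res <- exposed_data |>
  left_join(prod_rates, by = "product") |>
  group_by(pol_yr, inc_guar) |>
  exp_stats(expected = "q_prod")

# ae_control from the new argument should match the manual ae_q_prod
all.equal(ctrl_res$ae_control, manual_res$ae_q_prod)
```

The added test performs the same comparison using `product` and `qual` as control variables with premium weights; the sketch above drops the weights to keep the equivalence easy to follow.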