diff --git a/DESCRIPTION b/DESCRIPTION index f9242aa..86f247a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: optic Title: Simulation Tool for Causal Inference Using Longitudinal Data -Version: 1.0.1 +Version: 1.0.2 Authors@R: c( person("Beth Ann", "Griffin", , "bethg@rand.org", role = c("aut", "cph"), comment = c(ORCID = "0000-0002-2391-4601")), diff --git a/R/apply-treatment-effect.R b/R/apply-treatment-effect.R index 09c9531..9af50c4 100644 --- a/R/apply-treatment-effect.R +++ b/R/apply-treatment-effect.R @@ -16,12 +16,10 @@ #' @param concurrent bool for whether this is concurrent run or not #' #' @noRd -apply_treatment_effect <- function(x, model_formula, model_call, te, effect_direction, concurrent) { - # identify outcome - outcome <- model_terms(model_formula)[["lhs"]] +apply_treatment_effect <- function(x, model_formula, outcome, model_call, te, effect_direction, concurrent){ # identify additive or multiplicative modification of outcome required - if (model_call == "lm" | model_call == "feols" | model_call == "multisynth" | model_call == 'lmer') { + if (model_call != "glm.nb") { modifier <- "additive" } else if (model_call == "glm.nb") { modifier <- "multiplicative" @@ -46,9 +44,11 @@ apply_treatment_effect <- function(x, model_formula, model_call, te, effect_dire } else { x[[outcome]] <- x[[outcome]] + (x[[outcome]] * te * x[["treatment"]]) } - x[[outcome]] <- round2(x[[outcome]], 0) + + #l + # x[[outcome]] <- round2(x[[outcome]], 0) } return(x) } -} \ No newline at end of file +} diff --git a/R/concurrent-methods.R b/R/concurrent-methods.R index 24341c2..dfdf2fb 100644 --- a/R/concurrent-methods.R +++ b/R/concurrent-methods.R @@ -183,7 +183,8 @@ concurrent_premodel <- function(model_simulation) { model_call=model$model_call, te=c(model_simulation$effect_magnitude1, model_simulation$effect_magnitude2), effect_direction=model_simulation$effect_direction, - concurrent=TRUE + concurrent=TRUE, + outcome = outcome ) # if autoregressive, need to add lag for crude rate diff --git a/R/no-confounding-methods.R b/R/no-confounding-methods.R index 5f0faa0..592f783 100644 --- a/R/no-confounding-methods.R +++ b/R/no-confounding-methods.R @@ -45,11 +45,12 @@ noconf_sample <- function(single_simulation) { model_type = names(single_simulation$models) # If there is at least one did model - if(!any(model_type == "drdid")){ + if(!any(model_type == "did")){ outcomes <- unique(sapply(single_simulation$models, function(x) { optic::model_terms(x[["model_formula"]])[["lhs"]] })) }else{ - outcomes <- as.character(single_simulation$models$drdid$model_args$yname) + outcomes <- as.character(single_simulation$models$did$model_args$yname) } + ############################################### ### IDENTIFY TREATED UNITS AND TIME PERIODS ### ############################################### @@ -206,8 +207,15 @@ noconf_premodel <- function(model_simulation) { x <- model_simulation$data model <- model_simulation$models - outcome <- optic::model_terms(model$model_formula)[["lhs"]] - oo <- dplyr::sym(outcome) + + if (model$type != "did") { + outcome <- optic::model_terms(model$model_formula)[["lhs"]] + oo <- dplyr::sym(outcome) + } else if (model$type=='did') { + outcome <- as.character(model_simulation$models$model_args$yname) + oo <- dplyr::sym(outcome) + } + model_type <- model$type balance_statistics <- NULL @@ -221,38 +229,34 @@ noconf_premodel <- function(model_simulation) { model_call=model$model_call, te=c(model_simulation$effect_magnitude), effect_direction=model_simulation$effect_direction, - concurrent=FALSE + concurrent=FALSE, + outcome = outcome ) + + unit_sym <- dplyr::sym(model_simulation$unit_var) + time_sym <- dplyr::sym(model_simulation$time_var) # PNL note # this implementation does not seem to use the lagged crude rate # if autoregressive, need to add lag for crude rate # when outcome is deaths, derive new crude rate from modified outcome + # if autoregressive, need to add lagged outcome if (model_type == "autoreg") { - # get lag of crude rate and add it to the model - unit_sym <- dplyr::sym(model_simulation$unit_var) - time_sym <- dplyr::sym(model_simulation$time_var) - x <- x %>% dplyr::arrange(!!unit_sym, !!time_sym) %>% dplyr::group_by(!!unit_sym) %>% dplyr::mutate(lag_outcome = dplyr::lag(!!oo, n=1L)) %>% dplyr::ungroup() - if(model$model_call == "feols"){ formula_components <- as.character(model_simulation$models$model_formula) updated_3 <- strsplit(formula_components[3], " | ", fixed=TRUE) - if(length(updated_3[[1]]==1)){ - new_fmla <- as.formula(paste(formula_components[2], formula_components[1], updated_3[[1]][1], "+ lag_outcome")) - }else{ - new_fmla <- as.formula(paste(formula_components[2], formula_components[1], updated_3[[1]][1], "+ lag_outcome |", updated_3[[1]][2])) - } + + new_fmla <- as.formula(paste(formula_components[2], formula_components[1], updated_3[[1]][[1]], "+ lag_outcome")) + model_simulation$models$model_formula <- new_fmla - }else{ - model_simulation$models$model_formula <- update.formula(model_simulation$models$model_formula, ~ . + lag_outcome) - } + } else if (model_type == "multisynth") { x$treatment[x$treatment > 0] <- 1 x$treatment_level[x$treatment_level > 0] <- 1 @@ -261,9 +265,11 @@ noconf_premodel <- function(model_simulation) { if (sum(is.na(x[[outcome]])) > 0) { stop("multisynth method cannot handle missingness in outcome.") } - } else if (model_type == "drdid") { + } else if (model_type == "did") { + x$treatment[x$treatment > 0] <- 1 x$treatment_level[x$treatment_level > 0] <- 1 + } # get balance information @@ -305,6 +311,53 @@ noconf_premodel <- function(model_simulation) { mu0 = mean((!!oo)[treatment == 0], na.rm=T), sd = sd(!!oo, na.rm=T)) bal_stats = bind_cols(bal_stats, bal_stats2) + + # Add on first year for treatment, which we will use to define + # treatment status across models. + + x <- x %>% + group_by(!!unit_sym) %>% + mutate(treatment_date = ifelse(max(trt_ind == 1), max(year ^ (1-treatment)), 0)) %>% + ungroup() + + # Depending on the model, we may need to recode treatment year or + # treatment date: + + if (model_type == "reg"|model_type == "autoreg"){ + + # Sun and Abraham models expect treatment year to be Inf, if untreated: + x <- x %>% + mutate(treatment_date = ifelse(treatment_date == 0, Inf, treatment_date)) + + }else if (model_type == "did"){ + + # CSA models expect treatment year to be Inf, if untreated and expects + # location to be a numeric variable + x <- x %>% + mutate(treatment_date = ifelse(treatment_date == 0, Inf, treatment_date), + !!unit_sym := as.numeric(as.factor(!!unit_sym))) + + }else if (model_type == "multisynth"){ + + # Multisynth expects treatment to be either a 0/1 (sampling process uses this + # variable instrumentally to produce draws of treatment effect. So reset to + # binary) + + x <- x %>% mutate(treatment = ifelse(treatment > 0, 1, 0)) + + }else if (model_type == "did_imputation"){ + + # I added this section for completeness, in case we want to modify later. + #Borusyak, Jaravel, and Spiess expects treatment_date to be zero for + # untreated units. So no need to change data coding. + + }else if (model_type == "did2s"){ + + # Similar to multisynth, we need treatment to again be binary for + # the first stage + x <- x %>% mutate(treatment = ifelse(treatment > 0, 1, 0)) + + } model_simulation$balance_statistics <- bal_stats model_simulation$data <- x @@ -329,14 +382,30 @@ noconf_premodel <- function(model_simulation) { #' @noRd noconf_model <- function(model_simulation) { model <- model_simulation$models + model_type <- model_simulation$models$type + addtl_args <- model$model_args + x <- model_simulation$data - if (model_simulation$models$name == "drdid") { - args = c(list(data=x), model[["model_args"]]) - } else if(model$model_call == "feols"){ - args = c(list(data=x, fml=model[["model_formula"]]), model[["model_args"]], notes=FALSE) - }else { - args = c(list(data=x, formula=model[["model_formula"]]), model[["model_args"]]) + # The only consistent argument across these + # models is "data". We need to also pass along user provided arguments + # and set some arguments based off the specific model type + + args <- list(data=x) + + if (length(addtl_args) >= 1){ + args <- append(args, addtl_args) + } + + # Change names of provided arguments to meet the needs of respective packages + if (model_type == "reg"|model_type == "autoreg"){ + args[['formula']] <- model$model_formula + }else if (model_type == "multisynth"){ + args[['form']] <- model$model_formula + }else if (model_type == "did_imputation"){ + args[['horizon']] <- T + }else if (model_type == "did2s"){ + args[['treatment']] <- 'treatment' } m <- do.call( @@ -361,7 +430,12 @@ noconf_model <- function(model_simulation) { #' @importFrom stats resid #' @noRd noconf_postmodel <- function(model_simulation) { - outcome <- optic::model_terms(model_simulation$models[["model_formula"]])[["lhs"]] + + if (model_simulation$models[["type"]] == "did"){ + outcome <- as.character(model_simulation$models$model_args$yname) + }else{ + outcome <- optic::model_terms(model_simulation$models[["model_formula"]])[["lhs"]] + } # get run metadata to merge in after meta_data <- data.frame( model_name = model_simulation$models$name, @@ -375,10 +449,11 @@ noconf_postmodel <- function(model_simulation) { n_units=model_simulation$n_units, effect_direction=model_simulation$effect_direction ) + + m <- model_simulation$model_result # get model result information and apply standard error adjustments - if (model_simulation$models[["type"]] != "multisynth") { - m <- model_simulation$model_result + if (model_simulation$models[["type"]] == "reg"|model_simulation$models[["type"]] == "autoreg"|model_simulation$models[["type"]] == "did2s") { if(model_simulation$models$model_call=="feols"){ cf <- as.data.frame(summary(m)$coeftable) } else{ @@ -413,7 +488,31 @@ noconf_postmodel <- function(model_simulation) { mse=mean(resid(m)^2, na.rm=T), stringsAsFactors=FALSE ) - } else { + } else if (model_simulation$models[["type"]] == "did"){ + m_agg <- did::aggte(m, type='group', na.rm=T) + + cf <- data.frame("estimate" = m_agg$overall.att, + "se" = m_agg$overall.se) + + estimate <- cf$estimate + se <- cf$se + variance <- se ^ 2 + t_stat <- NA + p_value <- 2 * pnorm(abs(estimate/se), lower.tail = FALSE) + + results <- data.frame( + outcome=outcome, + se_adjustment="none", + estimate=estimate, + se=se, + variance=variance, + t_stat=NA, + p_value=p_value, + mse = NA, + stringsAsFactors=FALSE + ) + } + else{ m <- model_simulation$model_result cf <- summary(m) cf <- cf$att @@ -588,9 +687,9 @@ noconf_postmodel <- function(model_simulation) { } results <- left_join(results, meta_data, by="outcome") - results <- left_join(results, model_simulation$balance_statistics, by="outcome") return(results) + } diff --git a/R/optic-model-class.R b/R/optic-model-class.R index 7d9615d..4245991 100644 --- a/R/optic-model-class.R +++ b/R/optic-model-class.R @@ -67,7 +67,7 @@ optic_model <- function(name, type,call, formula, se_adjust, ...) { # Constructor: new_optic_model <- function( name, - type= c("reg", "autoreg", "multisynth", "drdid"), + type= c("reg", "autoreg", "multisynth", "did"), call= c("lm", "feols", "multisynth", "glm.nb"), formula, args=list(weights=as.name('population')), diff --git a/man/optic_model.Rd b/man/optic_model.Rd index e5a4ad6..606ed50 100644 --- a/man/optic_model.Rd +++ b/man/optic_model.Rd @@ -9,7 +9,7 @@ optic_model(name, type, call, formula, se_adjust, ...) \arguments{ \item{name}{Name of the model object, used to identify the model when reviewing simulation results} -\item{type}{Estimator used to identify the treatment effect using simulated data. Specified as a string, which can either be 'reg' (regression), 'autoreg' (autoregression, which adds a lag for the outcome variable to a regression model), 'drdid' (doubly-robust difference-in-difference estimator), or 'multisynth' (augmented synthetic control)} +\item{type}{Estimator used to identify the treatment effect using simulated data. Specified as a string, which can either be 'reg' (regression), 'autoreg' (autoregression, which adds a lag for the outcome variable to a regression model), 'did' (doubly-robust difference-in-difference estimator), or 'multisynth' (augmented synthetic control)} \item{call}{String which specifies the R function to call for applying the estimator. Package currently supports either 'lm' (linear model), 'feols' (fixed-effect OLS), 'multisynth' (pooled synthetic controls), or 'glm.nb' (negative-binomial generalized nearlized linear model)} diff --git a/vignettes/intro_optic.Rmd b/vignettes/intro_optic.Rmd index 20995d8..3d3cef2 100644 --- a/vignettes/intro_optic.Rmd +++ b/vignettes/intro_optic.Rmd @@ -141,7 +141,7 @@ then attempt to estimate this effect based on user-provided models. The - `name`: A name for the model to identify the model type in results. -- `type`: Users can set this as either "autoreg", "drdid", +- `type`: Users can set this as either "autoreg", "did", "multisynth", or "reg". - "reg" uses a typical regression framework, implementing the @@ -149,7 +149,7 @@ then attempt to estimate this effect based on user-provided models. The - "autoreg" adds a dependent lag to the model formula. - - "drdid" estimates a treatment effect using a doubly-robust + - "did" estimates a treatment effect using a doubly-robust difference-in-difference estimator, with covariates in the `model_formula` argument used within both the propensity score stage and outcome modeling stage (for more details on doubly robust diff --git a/vignettes/optic_refs.json b/vignettes/optic_refs.json index 296123b..b8b81dc 100644 --- a/vignettes/optic_refs.json +++ b/vignettes/optic_refs.json @@ -139,6 +139,20 @@ ] } }, + { + "id":"griffin2023identifying", + "author":[{"family":"Griffin","given":"Beth Ann"},{"family":"Schuler","given":"Megan S"},{"family":"Stone","given":"Elizabeth M"},{"family":"Patrick","given":"Stephen W"},{"family":"Stein","given":"Bradley D"},{"family":"De Lima","given":"Pedro Nascimento"},{"family":"Griswold","given":"Max"},{"family":"Scherling","given":"Adam"},{"family":"Stuart","given":"Elizabeth A"}], + "citation-key":"griffin2023identifying", + "container-title":"Epidemiology (Cambridge, Mass.)", + "container-title-short":"Epidemiology", + "issue":"6", + "issued":{"date-parts":[["2023"]]}, + "page":"856–864", + "publisher":"LWW", + "title":"Identifying optimal methods for addressing confounding bias when estimating the effects of state-level policies", + "type":"article-journal", + "URL":"https://journals.lww.com/epidem/abstract/2023/11000/identifying_optimal_methods_for_addressing.14.aspx","volume":"34" + }, { "id": "ben-michaelAugmentedSyntheticControl2021", "type": "article-journal",