Skip to content

Commit

Permalink
Make model output based on Stata and R models
Browse files Browse the repository at this point in the history
  • Loading branch information
venexia committed Feb 14, 2024
1 parent 33ecb07 commit 89b82f1
Showing 1 changed file with 154 additions and 15 deletions.
169 changes: 154 additions & 15 deletions analysis/make_model_output.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ active_analyses <- readr::read_rds("lib/active_analyses.rds")
# List available model outputs -------------------------------------------------
print('List available model outputs')

files <- list.files("output", pattern = "model_output-")
files_R <- list.files("output", pattern = "model_output-")

# Combine model outputs --------------------------------------------------------
print('Combine model outputs')
# Combine R model output -------------------------------------------------------
print('Combine R model output')

df <- NULL
df_R <- NULL

for (i in files) {
for (i in files_R) {

## Load model output

Expand Down Expand Up @@ -71,39 +71,178 @@ for (i in files) {

## Append to master dataframe

df <- plyr::rbind.fill(df,tmp)
df_R <- plyr::rbind.fill(df_R,tmp)
}


# Add details from active analyses ---------------------------------------------
print('Add details from active analyses')

df[,c("exposure","outcome")] <- NULL
df_R[,c("exposure","outcome")] <- NULL

df <- merge(df,
df_R <- merge(df_R,
active_analyses[,c("name","cohort","outcome","analysis")],
by = "name", all.x = TRUE)

df$outcome <- gsub("out_date_","",df$outcome)
df_R$outcome <- gsub("out_date_","",df_R$outcome)

df_R$source <- "R"

df_R <- df_R[,c("name","cohort","outcome","analysis","error","model","term",
"lnhr","se_lnhr","hr","conf_low","conf_high",
"N_total","N_exposed","N_events","person_time_total",
"outcome_time_median","strata_warning","surv_formula","source")]

# List Stata model output files to be combined ---------------------------------
print('List Stata model output files to be combined')

files_S <- list.files(path = "output/", pattern = "stata_model_output-")

# Create empty master data frame for Stata model output ------------------------
print('Create empty master data frame for Stata model output')

df_S <- NULL

# Append each file to master data frame for Stata model output -----------------
print('Append each file to master data frame for Stata model output ')

for (j in files_S) {

print(paste0("File: ",j))

# Load data ------------------------------------------------------------------
print('Load data')

stata_model_output <- readr::read_tsv(file = paste0("output/",j), skip = 2,
col_names = c("term",
"b_min","se_min","t_min",
"lci_min","uci_min","p_min",
"b_max","se_max","t_max",
"lci_max","uci_max","p_max"))

# Make variables numeric -----------------------------------------------------
print('Make variables numeric')

stata_model_output <- stata_model_output %>%
dplyr::mutate_at(setdiff(colnames(stata_model_output), "term"), as.numeric)

# Extract other details ------------------------------------------------------
print('Extract other details')

tmp <- stata_model_output[stata_model_output$term %in% c("N_sub","risk"),
c("term","b_min","b_max")]

# Transform data -------------------------------------------------------------
print('Transform data')

stata_model_output <- stata_model_output[(grepl("cov",stata_model_output$term) |
grepl("age",stata_model_output$term) |
grepl("days",stata_model_output$term)), ]

stata_model_output <- tidyr::pivot_longer(stata_model_output, cols = setdiff(colnames(stata_model_output), "term"))

stata_model_output <- tidyr::separate(stata_model_output, col = "name", into = c("stat","model"))

stata_model_output <- tidyr::pivot_wider(stata_model_output, id_cols = c("term","model"),names_from = "stat")

stata_model_output <- stata_model_output[!is.na(stata_model_output$b),]

# Add name -------------------------------------------------------------------
print('Add name')

stata_model_output$name <- gsub(".txt","",gsub("stata_model_output-","",j))

# Add median fup -------------------------------------------------------------
print('Add median fup')

median_fup <- readr::read_csv(file = paste0("output/",gsub(".txt",".csv",gsub("stata_model_output-","stata_fup-",j))))

median_fup <- dplyr::rename(median_fup,
"outcome_time_median" = "median_tte",
"N_events" = "events")

stata_model_output <- merge(stata_model_output,
median_fup,
by = "term",
all.x = TRUE)

# Rename columns -------------------------------------------------------------
print('Rename columns')

stata_model_output <- dplyr::rename(stata_model_output,
"lnhr" = "b",
"se_lnhr" = "se")

stata_model_output$model <- ifelse(stata_model_output$model=="max","mdl_max_adj",stata_model_output$model)
stata_model_output$model <- ifelse(stata_model_output$model=="min","mdl_age_sex",stata_model_output$model)

# Add other details ----------------------------------------------------------
print('Add other details')

tmp <- tidyr::pivot_longer(tmp, cols = setdiff(colnames(tmp), "term"), names_to = "model")

tmp$model <- ifelse(tmp$model=="b_max","mdl_max_adj",tmp$model)
tmp$model <- ifelse(tmp$model=="b_min","mdl_age_sex",tmp$model)

tmp$term <- ifelse(tmp$term=="N_sub","N_total",tmp$term)
tmp$term <- ifelse(tmp$term=="risk","person_time_total",tmp$term)

tmp <- tidyr::pivot_wider(tmp, names_from = "term")
stata_model_output <- merge(stata_model_output, tmp, by = "model", all.x = TRUE)

# Merge to master data frame -------------------------------------------------
print('Merge to master data frame')

df_S <- rbind(df_S, stata_model_output)

}

# Add missing columns --------------------------------------------------------
print('Add missing columns')

df_S <- tidyr::separate(df_S,
col = "name",
into = c("cohort","analysis","outcome"),
sep = "-", remove = FALSE)

df_S$cohort <- gsub("cohort_","",df_S$cohort)
df_S$hr <- exp(df_S$lnhr)
df_S$conf_low <- exp(df_S$lci)
df_S$conf_high <- exp(df_S$uci)
df_S$N_exposed <- NA
df_S$error <- ""
df_S$strata_warning <- ""
df_S$surv_formula <- ""
df_S$source <- "Stata"

df_S <- df_S[,c("name","cohort","outcome","analysis","error","model","term",
"lnhr","se_lnhr","hr","conf_low","conf_high",
"N_total","N_exposed","N_events","person_time_total",
"outcome_time_median","strata_warning","surv_formula","source")]

# Combine R and Stata model output ---------------------------------------------
print('Combine R and Stata model output')

df <- rbind(df_R,df_S)

# Save model output ------------------------------------------------------------
print('Save model output')

df <- df[,c("name","cohort","outcome","analysis","error","model","term",
"lnhr","se_lnhr","hr","conf_low","conf_high",
"N_total","N_exposed","N_events","person_time_total",
"outcome_time_median","strata_warning","surv_formula")]
"outcome_time_median","strata_warning","surv_formula","source")]

readr::write_csv(df, "output/model_output.csv")


# Perform redaction ------------------------------------------------------------
print('Perform redaction')

df[,c("N_total","N_exposed","N_events")] <- lapply(df[,c("N_total","N_exposed","N_events")],
FUN=function(y){roundmid_any(as.numeric(y), to=threshold)})
df$N_total_midpoint6 <- roundmid_any(as.numeric(df$N_total), to=threshold)
df$N_exposed_midpoint6 <- roundmid_any(as.numeric(df$N_exposed), to=threshold)
df$N_events_midpoint6 <- roundmid_any(as.numeric(df$N_events), to=threshold)
df[,c("N_total","N_exposed","N_events")] <- NULL

# Save model output ------------------------------------------------------------
print('Save model output')

readr::write_csv(df, "output/model_output_rounded.csv")
readr::write_csv(df, "output/model_output_midpoint6.csv")

0 comments on commit 89b82f1

Please sign in to comment.