# Rev-Table2-Misspec-LogNormal.R
# //////////////////////////////////////////////////////////////////////
# Run simulation results for Table 2 ///////////////////////////////////
# Compare full cohort analysis to CMI based on estimated survival //////
# function and adaptive quadrature vs. trapezoidal rule, log-normal X //
# //////////////////////////////////////////////////////////////////////
# Load packages
## Run once: install.packages("devtools")
## Run once: devtools::install_github("sarahlotspeich/imputeCensRd")
library(imputeCensRd) # To impute censored covariates
# Load data generating function generate_data() from GitHub
library(devtools) # To source an R script from GitHub
source_url("https://raw.githubusercontent.com/sarahlotspeich/ExtrapolationBeforeImputation/main/generate_data.R")
# Set the number of replicates per setting
reps = 100 ## We used a total of 1000, but see NOTES below
# Choose seed to be used for each simulation setting
args = commandArgs(TRUE)
## When running on the cluster, give each array a unique seed by adding the array ID to 11422
start_sim_seed = 11422 + 100 * as.integer(args)
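## For example, array ID 1 gives start_sim_seed = 11422 + 100 * 1 = 11522,
## so its replicates use seeds 11523-11622 (start_sim_seed + 1:reps, below)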
# Number of bootstraps for CSI-B standard errors
B = 1000
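## Setting B = 0 skips the bootstrap and uses the model-based standard
## errors from lm() instead (see the if (B == 0) branches below)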
# Set simulation setting
censoring = "heavy" ## Censoring rate in X
distX = "lognormal" ## Set distribution for X | Z
# Loop over different sample sizes: n = 100, 500, 2000
for (n in c(100, 500, 2000)){
# Create dataframe to save results for setting
sett_res = data.frame(sim = start_sim_seed + 1:reps,
censoring, n, perc_censored = NA,
alpha_fc = NA, beta_fc = NA, gamma_fc = NA,
alpha_aq = NA, beta_aq = NA, gamma_aq = NA,
se_alpha_aq = NA, se_beta_aq = NA, se_gamma_aq = NA,
alpha_tr = NA, beta_tr = NA, gamma_tr = NA,
se_alpha_tr = NA, se_beta_tr = NA, se_gamma_tr = NA,
time_aq = NA, conv_msg_aq = NA, time_tr = NA, conv_msg_tr = NA)
# Loop over replicates
for (r in 1:reps) {
# For reproducibility
set.seed(sett_res$sim[r])
# Generate data
dat = generate_data(n = n, ## Sample size
censoring = censoring, ## Censoring setting
distX = distX, ## Distribution for X
XdepZ = TRUE) ## Since TRUE, assume that X depends on Z
# Save % censored
sett_res$perc_censored[r] = 1 - mean(dat$d)
# Method 1: Full cohort analysis
fit_fc = lm(y ~ x + z, data = dat)
sett_res[r, c("alpha_fc", "beta_fc", "gamma_fc")] = fit_fc$coefficients
# Method 2: CMI using adaptive quadrature
## Create imputed dataset
time_aq = system.time(
imp_dat <- cmi_sp(imputation_model = Surv(time = w, event = d) ~ z,
data = dat,
trapezoidal_rule = FALSE, ## approximate integral using adaptive quadrature
surv_between = "cf", ## Breslow's estimator is carry-forward interpolated
surv_beyond = "w") ## and extrapolated using the Weibull extension
)
sett_res[r, "time_aq"] = as.numeric(time_aq["elapsed"]) ## save computing time
## Check that imputation was successful
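## (per its use below, cmi_sp() returns a list: imp_dat$code flags success,
## and imp_dat$imputed_data holds the data with imputed values in column imp)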
if (imp_dat$code) {
if (B == 0) {
## Fit model to imputed dataset
fit_aq = lm(y ~ imp + z,
data = imp_dat$imputed_data)
sett_res[r, c("alpha_aq", "beta_aq", "gamma_aq")] = fit_aq$coefficients
sett_res[r, c("se_alpha_aq", "se_beta_aq", "se_gamma_aq")] = sqrt(diag(vcov(fit_aq)))
} else {
## Fit model to B bootstrapped resamples from the imputed dataset
all_fits = do.call(what = rbind,
args = replicate(n = B,
expr = coefficients(summary(lm(y ~ imp + z,
data = imp_dat$imputed_data[sample(x = 1:n,
size = n,
replace = TRUE), ])
)
),
simplify = FALSE))
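## all_fits stacks B coefficient matrices of 3 rows each ((Intercept), imp, z),
## so rep(1:3, times = B) below tags each row with its coefficient index for rowsum()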
## Pool estimates and standard errors using Rubin's Rules
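## (pooled variance = within-resample variance VW plus (1 + 1/B) times
## between-resample variance VB, i.e., VW + VB + VB / B below)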
pooled_est = rowsum(x = all_fits[, "Estimate"], group = rep(1:3, times = B)) / B
sett_res[r, c("alpha_aq", "beta_aq", "gamma_aq")] = pooled_est
VW = rowsum(x = all_fits[, "Std. Error"] ^ 2, group = rep(1:3, times = B)) / B
VB = rowsum(x = (all_fits[, "Estimate"] - pooled_est[rep(1:3, times = B)]) ^ 2, group = rep(1:3, times = B)) / (B - 1)
pooled_var = VW + VB + (VB / B)
sett_res[r, c("se_alpha_aq", "se_beta_aq", "se_gamma_aq")] = sqrt(pooled_var)
}
} else {
sett_res$conv_msg_aq[r] = paste(unique(imp_dat$imputed_data$imp_msg), collapse = ", ")
}
# Method 3: CMI using trapezoidal rule
time_tr = system.time(
imp_dat <- cmi_sp(imputation_model = Surv(time = w, event = d) ~ z,
data = dat,
trapezoidal_rule = TRUE, ## approximate integral using the trapezoidal rule
surv_between = "cf", ## Breslow's estimator is carry-forward interpolated
surv_beyond = "w") ## and extrapolated using the Weibull extension
)
sett_res[r, "time_tr"] = as.numeric(time_tr["elapsed"]) ## save computing time
## Check that imputation was successful
if (imp_dat$code) {
if (B == 0) {
## Fit model to imputed dataset
fit_tr = lm(y ~ imp + z,
data = imp_dat$imputed_data)
sett_res[r, c("alpha_tr", "beta_tr", "gamma_tr")] = fit_tr$coefficients
sett_res[r, c("se_alpha_tr", "se_beta_tr", "se_gamma_tr")] = sqrt(diag(vcov(fit_tr)))
} else {
## Fit model to B bootstrapped resamples from the imputed dataset
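## (same bootstrap and Rubin's-Rules pooling as Method 2; see comments there)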
all_fits = do.call(what = rbind,
args = replicate(n = B,
expr = coefficients(summary(lm(y ~ imp + z,
data = imp_dat$imputed_data[sample(x = 1:n,
size = n,
replace = TRUE), ])
)
),
simplify = FALSE))
## Pool estimates and standard errors using Rubin's Rules
pooled_est = rowsum(x = all_fits[, "Estimate"], group = rep(1:3, times = B)) / B
sett_res[r, c("alpha_tr", "beta_tr", "gamma_tr")] = pooled_est
VW = rowsum(x = all_fits[, "Std. Error"] ^ 2, group = rep(1:3, times = B)) / B
VB = rowsum(x = (all_fits[, "Estimate"] - pooled_est[rep(1:3, times = B)]) ^ 2, group = rep(1:3, times = B)) / (B - 1)
pooled_var = VW + VB + (VB / B)
sett_res[r, c("se_alpha_tr", "se_beta_tr", "se_gamma_tr")] = sqrt(pooled_var)
}
} else {
sett_res$conv_msg_tr[r] = paste(unique(imp_dat$imputed_data$imp_msg), collapse = ", ")
}
# Save results
write.csv(x = sett_res,
file = paste0("rev_table2_", distX, "_", censoring, "_n", n, "_seed", start_sim_seed, ".csv"),
row.names = F)
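## (written inside the replicate loop, so partial results are saved if a job is interrupted)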
}
}
# //////////////////////////////////////////////////////////////////////
# NOTES: When using the estimated survival function, we need to fit ////
# an imputation model and use an extrapolation method. This makes //////
# the simulations for this table slower than those for Table 1. It /////
# took ~3 minutes to run 1 replication per setting on a MacBook Pro ////
# (M1) with 16GB RAM, so 1000 replications per setting would take //////
# ~50 hours. We parallelized instead, using sim_seed = 114-133 and /////
# running reps = 50 replications per seed. /////////////////////////////
# //////////////////////////////////////////////////////////////////////
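# //////////////////////////////////////////////////////////////////////
# POST-PROCESSING SKETCH (our illustration, not part of the original) //
# Once all array jobs finish, the per-seed CSVs written above can be ///
# stacked into one results table. This assumes the "rev_table2_*.csv" //
# files sit in the working directory; adjust the pattern if needed. ////
# //////////////////////////////////////////////////////////////////////
## Run once, after all jobs finish:
## all_res = do.call(what = rbind,
##                   args = lapply(X = list.files(pattern = "^rev_table2_.*\\.csv$"),
##                                 FUN = read.csv))
## mean(all_res$perc_censored) ## e.g., inspect the realized censoring rate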