##############
# Stage2_Functions.R
# R code to implement all Stage2 analyses to compare intervention effects between arms
#
# Laura B. Balzer, PhD MPhil
# lbalzer@umass.edu
# Lead Statistician for SEARCH
#
# Input: Stage1 estimated cluster-level outcome & cluster-level covariates/exposure
# Outputs: an estimate of the intervention effect
# Goal option specifies if arithmetic risk ratio (aRR) or risk difference (RD)
# Weighting option specifies whether to weight clusters or individuals equally when computing
# the intervention effect.
##---------------------------
#################
#-----------------------------------------------------#-----------------------------------------------------
# Stage2: function to estimate the intervention effect
#
# input:
# goal; aRR= arithmetic risk ratio; otherwise risk difference (RD)
# weighting; "clust" weights clusters equally; otherwise weights indv equally,
# observed data (data.input),
# name of column corresponding to the outcome (outcome),
# survival: indicator that survival probabilities are inputted but the outcome is the probability of the bad event
# (currently applicable to TB/mortality)
# set of candidate adjustment variables; must be at cluster-level (clust.adj)
# prespecified adjustment variables for estimation of the conditional mean outcome with main terms (e.g. Y~A+W1+W2) (QAdj),
# prespecified adjustment variables for estimation of the propensity score with main terms (e.g. A~W1+W2) (gAdj),
# indicator to do adaptive prespecification (do.data.adapt),
# indicator to break the pair-match (break.match)
# indicator to print updates (verbose)
# output: point estimate and inference
#
# note the data.input must have
# 1) a column 'id' indicating cluster membership
# 2) a column 'U' as dummy adjustment variable (=1 for all)
# both of these requirements are taken care of by the preprocess function
#
# note2: if you are running adaptive prespecification to select the candidate C-TMLE
# minimizing the variance (which nearly all Stage 2 analyses do),
# please set do.data.adapt=T and specify the candidate adjustment variables in clust.adj.
# Please do NOT use (QAdj, gAdj), which are reserved for running non-adaptive GLMs only.
#
# Note3: when specifying clust.adj, please include the dummy variable U
# to make sure that the unadjusted estimator is considered as a candidate.
# e.g. clust.adj <- c('U', 'hiv_prev_0', 'mc_cover_0')
#
# note4: to run the unadjusted estimator, set clust.adj=NULL and do.data.adapt=F.
#
# EXAMPLES
## Stage2 estimation of the arithmetic risk ratio (aRR), weighting clusters equally, & preserving the matches
#
## First with TMLE with adaptive pre-specification
# clust.adj <- c('U', 'chc_prev', 'bmi')
# Yc1.adj <- Stage2(goal='aRR', weighting='clust', data.input=pri,
#                   outcome='Yc', clust.adj=clust.adj, do.data.adapt=T, break.match=F)
#
## Then with the unadjusted estimator
# Yc1.unadj <- Stage2(goal='aRR', weighting='clust', data.input=pri,
#                     outcome='YcU', clust.adj=NULL, do.data.adapt=F, break.match=F)
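#
## Illustrative sketch only (not a prespecified SEARCH analysis): the risk difference,
## weighting individuals equally, & breaking the matches. Any goal other than 'aRR' gives
## the RD, and any weighting other than 'clust' weights individuals equally.
# Yc1.RD <- Stage2(goal='RD', weighting='indv', data.input=pri,
#                  outcome='Yc', clust.adj=clust.adj, do.data.adapt=T, break.match=T)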
#-----------------------------------------------------#-----------------------------------------------------
Stage2 <- function(goal='aRR', weighting='clust', data.input,
outcome=NULL, survival=F,
clust.adj, QAdj=NULL, gAdj=NULL,
do.data.adapt =F, break.match=F, verbose=F){
outcome.string <- make.full.word(outcome)
names(data.input)[grep(outcome.string, names(data.input) ) ] <- 'Y'
nIndv.string <- make.full.word( paste('nIndv', outcome, sep='_') )
names(data.input)[grep(nIndv.string, names(data.input) ) ] <- 'nIndv'
# if input survival probabilities, but the outcome is probability of badness (e.g. TB)
if(survival){
data.input$Y <- 1 - data.input$Y
print('Flipping outcome to be probability of badness')
}
# if haven't already, get weights
if( sum(grep('alpha', colnames(data.input)))==0){
data.input <- get.weights(data.input, weighting=weighting)
}
if(do.data.adapt){
# implement adaptive prespecification
select<- do.adaptive.prespec(goal=goal, weighting=weighting, break.match = break.match,
Ldata= data.input, clust.adj=clust.adj, QAdj=QAdj, gAdj=gAdj)
QAdj = select$QAdj
gAdj = select$gAdj
}
# Run full TMLE algorithm with prespecified adjustment set
est<- do.TMLE(goal=goal, weighting=weighting, train=data.input, QAdj=QAdj,
gAdj=gAdj, verbose=verbose)
# Get point estimates and inference
R1=est$R1
R0=est$R0
n.clust <- length(unique(data.input$id))
n.pair <- length(unique(data.input$pair))
if(break.match){
# if breaking the match, set df to (#clusters -2)
df <-n.clust - 2
inference <- get.inference(goal=goal, R1=R1, R0=R0, var.IC= est$var.break, df=df )
} else{
# if preserving the match, set df to (#pairs-1)
df <- n.pair-1
inference <- get.inference(goal=goal, R1=R1, R0=R0, var.IC= est$var.pair, df=df)
}
Txt<- get.CI(psi.hat=R1, var.IC=est$var.R1, df=(n.clust-2) )
Con <- get.CI(psi.hat=R0, var.IC=est$var.R0, df=(n.clust-2) )
RETURN<- data.frame(Txt=Txt, Con=Con, inference, QAdj=est$QAdj, gAdj=est$gAdj)
RETURN
}
#-----------------------------------------------------#-----------------------------------------------------
# do.TMLE: master function to run TMLE
#
# input:
# goal - aRR: arithmetic risk ratio; O/W risk difference,
# training data (train),
# weighting; "clust" weights clusters equally; O/W weights indv equally,
# prespecified adjustment variables for the conditional mean outcome (QAdj),
# prespecified adjustment variables for the propensity score (gAdj),
# initial estimator of the conditional mean outcome (Q.out),
# estimator of the propensity score (p.out),
# indicator to print updates (verbose)
#
# output: list
# training data augmented with estimates,
# prespecified adjustment variables for the conditional mean outcome (QAdj),
# prespecified adjustment variables for the propensity score (gAdj),
# initial estimator of the conditional mean outcome (Q.out),
# estimator of the propensity score (p.out),
# estimated fluctuation coefficient (epsilon),
# estimated risk under txt and control (R1, R0),
# estimated variance preserving the match (var.pair),
# estimated variance breaking the match (var.break)
#-----------------------------------------------------#-----------------------------------------------------
do.TMLE <- function(goal, train, weighting='clust',
QAdj, gAdj=NULL, Q.out=NULL, p.out=NULL,
verbose=F){
J <- length(unique(train$id))
# if haven't already, aggregate to the cluster level
if( nrow(train) > J){
train <- aggregate(train, by=list(train$id), mean)[,2: (ncol(train)+1)]
}
# if haven't already, add a column for weights
if( sum(grep('alpha', colnames(train)))==0){
train <- get.weights(train, weighting=weighting)
}
#=====================================================
# Step1 - initial estimation of E(Y|A,W)= Qbar(A,W)
#=====================================================
# run glm on the adjustment set
Q<- do.Init.Qbar(train=train, QAdj=QAdj, glm.out=Q.out , verbose=verbose)
train <- Q$train
#==========================================================
# Step2: Calculate the clever covariate
#==========================================================
G <- get.clever.cov(train=train, gAdj=gAdj, p.out=p.out, verbose=verbose)
train <- G$train
#==========================================================
# Step3: Targeting
#==========================================================
eps <- get.epsilon(train=train, goal=goal, verbose=verbose)
train <- do.targeting(train=train, eps=eps, goal=goal)
#==========================================================
# Step4: Parameter estimation
#==========================================================
# aggregated data: the weights are 1 for cluster-effect & n_j*J/nTot for individual-level effect
# note weights are normalized to sum to J
R1 <- mean( train$alpha*train$Qbar1W.star )
R0 <- mean( train$alpha*train$Qbar0W.star )
variance.out <- get.IC.variance(goal=goal, Vdata=train, R1=R1, R0=R0)
RETURN<- list(train=train,
QAdj=Q$QAdj, Q.out=Q$glm.out,
gAdj=G$gAdj, p.out=G$p.out,
eps=eps, R1=R1, R0=R0,
var.R1=variance.out$var.R1,
var.R0=variance.out$var.R0,
var.pair=variance.out$var.pair,
var.break=variance.out$var.break)
RETURN
}
#-----------------------------------------------------#-----------------------------------------------------
# do.Init.Qbar - function to do initial estimation of E[Y|A,W] = Qbar(A,W)
# input:
# data set, adjustment variable(s), outcome regression fit on training set, verbose
# output:
# adjustment variable(s)
# outcome regression fit on training set
# data set augmented w/ initial predictions: Qbar(A,W), Qbar(1,W) and Qbar(0,W)
#-----------------------------------------------------#-----------------------------------------------------
do.Init.Qbar<- function(train, QAdj, glm.out=NULL, verbose=F){
if( is.null(QAdj) ){
QAdj<- 'U'
}
train.temp <- train[, c(QAdj, 'A', 'Y')]
X1<- X0<- train.temp
X1$A<-1; X0$A<- 0
if( is.null(glm.out) ){
# fit using the training data
glm.out<- suppressWarnings( glm( Y~. , family='binomial', data=train.temp, weights=train$alpha ) )
if(verbose) print(glm.out)
}
# get initial predictions
QbarAW <- predict(glm.out, newdata=train.temp, type='response')
Qbar1W <- predict(glm.out, newdata=X1, type='response')
Qbar0W <- predict(glm.out, newdata=X0, type='response')
list(QAdj=QAdj, glm.out=glm.out, train=data.frame(train, QbarAW , Qbar1W , Qbar0W ))
}
#-----------------------------------------------------#-----------------------------------------------------
# get.clever.cov - function to calculate the clever covariate
# input:
# data set, adjustment variable(s), pscore regression fit on training set, verbose
# output:
# adjustment variable(s), pscore regression fit on training set,
# data set augmented with pscore & clever covariate (H.AW, H.1W, H.0W)
#-----------------------------------------------------#-----------------------------------------------------
get.clever.cov<- function(train, gAdj, p.out=NULL, verbose=F){
if( is.null(gAdj) ){
gAdj <- 'U'
}
train.temp <- train[, c(gAdj, 'A')]
if( is.null(p.out) ){
# fit pscore on training set
p.out<- suppressWarnings( glm( A~. , family='binomial', data= train.temp, weights=train$alpha) )
if(verbose){ print(p.out)}
}
# now use p.out to get estimated pscores
pscore <- predict(p.out, newdata= train.temp, type="response")
# Note: if gAdj='U' and the weights train$alpha differ from 1, the fitted pscore can differ from 0.5
# bound the pscore away from 0/1 - the bounds should not bind in a randomized trial
pscore [pscore < 0.025] <- 0.025
pscore [pscore > 0.975] <- 0.975
A.train <- train$A
# Clever covariate is two-dimensional;
H.1W <- A.train/pscore
H.0W <- (1-A.train)/(1-pscore )
# difference of the two components; serves as the 1-dim clever covariate for the RD
H.AW <- H.1W - H.0W
list(gAdj=gAdj, p.out=p.out, train=data.frame(train, pscore, H.1W , H.0W , H.AW) )
}
#-----------------------------------------------------#-----------------------------------------------------
# get.epsilon - function to calculate the fluctuation coefficient
# input:
# data set, goal with 'aRR'=arithmetic RR, verbose
# output:
# estimated fluctuation coefficient (eps)
#-----------------------------------------------------#-----------------------------------------------------
get.epsilon <- function(train, goal, verbose=F){
A.train<- train$A
Y.train<- train$Y
# Skip the update if the outcome is 0 or 1 for all observations in either the txt or control group
Skip.update <- mean(Y.train[A.train==1])==0 | mean(Y.train[A.train==0])==0 |
mean(Y.train[A.train==1])==1 | mean(Y.train[A.train==0])==1
if(goal!='aRR'){
# if not going after aRR, then use a 1-dim clever covariate
if(!Skip.update){
logitUpdate<- suppressWarnings(
glm(Y.train ~ -1 +offset(qlogis(train$QbarAW )) + train$H.AW, family="binomial", weights=train$alpha))
eps<-logitUpdate$coef
} else{
eps<- 0
}
names(eps) <- 'H.AW'
}else{
# targeting the aRR requires a two-dimensional clever covariate
if( !Skip.update ){
logitUpdate<- suppressWarnings(
glm(Y.train ~ -1 +offset(qlogis(train$QbarAW )) + train$H.0W + train$H.1W, family="binomial", weights=train$alpha))
eps<-logitUpdate$coef
} else{
eps <- c(0,0)
}
names(eps)<- c('H.0W', 'H.1W')
}
if(verbose) print(eps)
eps
}
#-----------------------------------------------------#-----------------------------------------------------
# do.targeting - function to update initial estimators of QbarAW
# input:
# data set (train), fluctuation coefficient (eps), goal (aRR= arithmetic risk ratio; otherwise RD)
# output:
# data.frame w/ targeted predictions: Qbar*(A,W), Qbar*(1,W), Qbar*(0,W)
#-----------------------------------------------------#-----------------------------------------------------
do.targeting <- function(train, eps, goal){
g1W<- train$pscore
g0W<- (1 - g1W)
if(goal!='aRR'){
# updated QbarAW estimates for training set.
QbarAW.star <- plogis( qlogis(train$QbarAW ) + eps*train$H.AW)
Qbar1W.star <- plogis( qlogis(train$Qbar1W ) + eps/g1W )
Qbar0W.star <- plogis( qlogis(train$Qbar0W ) - eps/g0W )
}else{
# updated QbarAW estimates for training set.
QbarAW.star <- plogis( qlogis(train$QbarAW) + eps['H.0W']*train$H.0W + eps['H.1W']*train$H.1W)
Qbar0W.star <- plogis( qlogis(train$Qbar0W) + eps['H.0W']/g0W )
Qbar1W.star <- plogis( qlogis(train$Qbar1W) + eps['H.1W']/g1W )
}
train <- data.frame(train, QbarAW.star, Qbar1W.star, Qbar0W.star)
train
}
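# Worked form of the update implemented above (restating the code, for reference):
#   RD:  logit Qbar*(A,W) = logit Qbar(A,W) + eps*H.AW, with H.AW = A/g1W - (1-A)/g0W,
#        so Qbar*(1,W) adds eps/g1W on the logit scale and Qbar*(0,W) subtracts eps/g0W.
#   aRR: logit Qbar*(A,W) = logit Qbar(A,W) + eps['H.0W']*H.0W + eps['H.1W']*H.1W,
#        so Qbar*(1,W) adds eps['H.1W']/g1W and Qbar*(0,W) adds eps['H.0W']/g0W.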
#-----------------------------------------------------#-----------------------------------------------------
# get.IC.variance - function to do influence curve-based variance estimate
# input:
# goal (aRR= arithmetic risk ratio; otherwise RD)
# dataset
# risk estimates under txt and control R1 & R0
# output: (on the log scale if goal='aRR')
# estimated IC & variance - preserving/breaking the match
#-----------------------------------------------------#-----------------------------------------------------
get.IC.variance<- function(goal, Vdata, R1, R0){
# aggregated data, wt=1 for cluster-level effect & wt=n_j*J/nTot for indv
# again noting that weights are normalized to sum to J
DY1 <- Vdata$alpha*Vdata$H.1W*(Vdata$Y - Vdata$Qbar1W.star)
DY0 <- Vdata$alpha*Vdata$H.0W*(Vdata$Y - Vdata$Qbar0W.star)
if(goal !='aRR'){
# going after RD, easy IC
DY <- DY1 - DY0
} else{
# going after aRR, then get IC estimate on log scale
# i.e. Delta method for log(aRR) = log(R1) - log(R0)
DY <- 1/R1*DY1 - 1/R0*DY0
}
# estimated variance for txt specific means or if break the match
J<- length( unique(Vdata$id) )
var.R1 <- var(DY1) /J
var.R0 <- var(DY0) / J
var.break <- var(DY) /J
# estimated variance if preserve the match
pairs <- unique(Vdata$pair)
n.pairs <- length(pairs)
DY.paired <- rep(NA, n.pairs)
for(i in 1:n.pairs){
these<- Vdata$pair== pairs[i]
DY.paired[i]<- 0.5*sum(DY[ these] )
}
var.pair <- var(DY.paired) / n.pairs
list(var.R1=var.R1, var.R0=var.R0, DY=DY, var.break=var.break, DY.paired=DY.paired, var.pair=var.pair)
}
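# Delta-method reasoning behind the aRR branch above: for psi = log(aRR) = log(R1) - log(R0),
# the gradient with respect to (R1, R0) is (1/R1, -1/R0), so the estimated influence curve is
# DY = (1/R1)*DY1 - (1/R0)*DY0, as computed in the code.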
#-----------------------------------------------------#-----------------------------------------------------
# get.CI: mini function to calculate two-sided (1-sig.level)*100% confidence intervals
# input:
# pt estimate
# var.IC (variance estimate)
# df (degrees of freedom for Student's t-dist )
# sig.level (significance level)
# output:
# data.frame with point estimate, lower/upper CI bounds, and variance estimate
#-----------------------------------------------------#-----------------------------------------------------
get.CI <- function(psi.hat, var.IC, df, sig.level=0.05){
# cutoff based on t-dist for testing and CI
cutoff <- qt(sig.level/2, df=df, lower.tail=F)
# standard error (square root of the variance)
se<- sqrt(var.IC)
# 95% confidence interval
CI.lo <- (psi.hat - cutoff*se)
CI.hi <- (psi.hat + cutoff*se)
out<- data.frame(est=psi.hat, CI.lo, CI.hi, var.hat=var.IC)
out
}
#-----------------------------------------------------#-----------------------------------------------------
# get.inference: function to calculate (1-sig.level)*100% confidence intervals & test the null hypothesis
# input:
# goal (aRR= arithmetic risk ratio; otherwise RD)
# risk estimates on txt and control (R1, R0)
# var (variance estimate - on log scale for aRR),
# df (degrees of freedom if using a Student's t-dist )
# sig.level (significance level)
# output:
# variance, test statistic, confidence intervals, pval, indicator reject null
# note: if goal=aRR, variance & test stat are on log-scale
#-----------------------------------------------------#-----------------------------------------------------
get.inference <- function(goal, R1, R0, var.IC, df, sig.level=0.05){
if( goal=='aRR' ){
psi.hat <- log(R1/R0)
} else{
psi.hat <- R1- R0
}
# cutoff based on t-dist for testing and CI
cutoff <- qt(sig.level/2, df=df, lower.tail=F)
# standard error (square root of the variance)
se<- sqrt(var.IC)
# test statistic (if goal=aRR then on the transformed scale)
tstat <- psi.hat/se
# p-value
pval<- 2*pt(abs(tstat), df=df, lower.tail=F)
# reject the null
reject <- pval < sig.level
# 95% confidence interval
CI.lo <- (psi.hat - cutoff*se)
CI.hi <- (psi.hat + cutoff*se)
if(goal=='aRR'){
psi.hat<- exp(psi.hat)
CI.lo <- exp(CI.lo)
CI.hi <- exp(CI.hi)
}
out<- data.frame(Effect.est=psi.hat, Effect.CI.lo=CI.lo, Effect.CI.hi=CI.hi, var.hat=var.IC, tstat, pval, reject)
out
}
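# Minimal usage sketch with made-up numbers (for illustration only; not SEARCH results):
# out <- get.inference(goal='aRR', R1=0.10, R0=0.15, var.IC=0.01, df=15)
# out$Effect.est                        # back-transformed aRR = 0.10/0.15 ~ 0.67
# c(out$Effect.CI.lo, out$Effect.CI.hi) # 95% CI, exponentiated since goal='aRR'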
#----------------------------------------------
# get.weights: function to get appropriate weights for the analysis
# input: data & weighting scheme (clusters equal or indiv equal)
# output: data with a column alpha for weights
#----------------------------------------------
################
get.weights<- function(data, weighting){
# reset any previously inputted weights
data$alpha <- -99
data.clust <- aggregate(data[,c('id', 'nIndv')], by=list(data$id), mean)
nTot <- sum(data.clust$nIndv)
J <- nrow(data.clust)
if( nrow(data) > J ){
# if not aggregated to the cluster-level
if(weighting=='clust'){
# corresponds to the empirical mean in each cluster
data$alpha <- 1/data$nIndv
} else{
data$alpha <- J/nTot
}
}else{
# aggregated to cluster-level
if(weighting=='clust'){
data$alpha <- 1
} else{
data$alpha <- data$nIndv*J/nTot
}
}
data
}
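# Worked example with hypothetical numbers, for cluster-level data:
# with J = 2 clusters of sizes nIndv = c(100, 300), so nTot = 400,
#   weighting='clust' gives alpha = c(1, 1)                       (clusters count equally)
#   any other weighting gives alpha = nIndv*J/nTot = c(0.5, 1.5)  (weights sum to J, proportional to size)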
############
# make.full.word: helper to wrap a string in regex word boundaries (\< \>)
# so that grep matches the exact column name only
make.full.word <- function(string){
paste("\\<", string, "\\>", sep="")
}
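# Example (illustrative): the word boundaries prevent partial matches when renaming columns,
# e.g. grep(make.full.word('Yc'), c('Yc', 'YcU')) returns 1, whereas grep('Yc', ...) matches both.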
#----------------------------------------------
# get.worst.pair: function to compute the absolute within-pair difference in a baseline
# variable (default 'hiv_prev_0') across the 16 matched pairs
#----------------------------------------------
get.worst.pair <- function(data.input, variable.name='hiv_prev_0') {
temp <- rep(NA, 16)
for(j in 1:16){
these<- data.input[data.input$pair==j, variable.name]
temp[j] <- abs( these[1]-these[2])
# print(c(j, temp[j]))
}
temp
}