generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcox_model.do
226 lines (175 loc) · 6.39 KB
/
cox_model.do
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
* Specify parameters
local name "`1'"
local day0 "`2'"
// For local testing:
local name "cohort_prevax_extf-day0_sub_sex_male-depression"
local day0 "TRUE"
* Set Ado file path
adopath + "analysis/extra_ados"
* Unzip the input data
shell gunzip "./output/ready-`name'.csv.gz"
* Import and describe data
import delim using "./output/ready-`name'.csv", clear
describe
* Filter data
keep patient_id exposure outcome fup_start fup_stop cox_weight cov_cat* cov_num* cov_bin*
drop cov_num_age_sq
duplicates drop
* Rename variables
rename cov_num_age age
rename cov_cat_region region
* Generate pre vaccination cohort dummy variable
local prevax_cohort = regexm("`name'", "_pre")
display "`prevax_cohort'"
* Generate sex indicator
local sub_sex = regexm("`name'", "sub_sex")
display "`sub_sex'"
* Replace NA with missing value that Stata recognises
ds , has(type string)
foreach var of varlist `r(varlist)' {
replace `var' = "" if `var' == "NA"
}
* Reformat variables
foreach var of varlist exposure outcome fup_start fup_stop {
split `var', gen(tmp_date) parse(-)
gen year = real(tmp_date1)
gen month = real(tmp_date2)
gen day = real(tmp_date3)
gen `var'_tmp = mdy(month, day, year)
format %td `var'_tmp
drop `var' tmp_date* year month day
rename `var'_tmp `var'
}
* Encode non-numeric variables
foreach var of varlist region cov_bin* cov_cat* {
di "Encoding `var'"
local var_short = substr("`var'", 1, length("`var'") - 1)
encode `var', generate(`var_short'1)
drop `var'
rename `var_short'1 `var'
}
* Summarize missingness
misstable summarize
* Update follow-up end
replace fup_stop = fup_stop + 1
format fup_stop %td
* Make age spline
centile age, centile(10 50 90)
mkspline age_spline = age, cubic knots(`r(c_1)' `r(c_2)' `r(c_3)')
* Make outcome status variable
egen outcome_status = rownonmiss(outcome)
* Apply stset including IPW here as unsampled datasets will be provided with cox_weights set to 1
if `prevax_cohort'==1 {
if "`day0'"=="TRUE" {
stset fup_stop [pweight=cox_weight], failure(outcome_status) id(patient_id) enter(fup_start) origin(time mdy(01,01,2020))
stsplit time, after(exposure) at(0 1 28 197 365 714)
replace time = 714 if time==-1
}
else {
stset fup_stop [pweight=cox_weight], failure(outcome_status) id(patient_id) enter(fup_start) origin(time mdy(01,01,2020))
stsplit time, after(exposure) at(0 28 197 365 714)
replace time = 714 if time==-1
}
}
else {
if "`day0'"=="TRUE" {
stset fup_stop [pweight=cox_weight], failure(outcome_status) id(patient_id) enter(fup_start) origin(time mdy(01,06,2021))
stsplit time, after(exposure) at(0 1 28 197)
replace time = 197 if time==-1
}
else {
stset fup_stop [pweight=cox_weight], failure(outcome_status) id(patient_id) enter(fup_start) origin(time mdy(01,06,2021))
stsplit time, after(exposure) at(0 28 197)
replace time = 197 if time==-1
}
}
* Calculate study follow up
gen fup = _t - _t0
egen fup_total = total(fup)
* Make days variables
if "`day0'"=="TRUE" {
gen days0_1 = 0
replace days0_1 = 1 if time==0
tab days0_1
gen days1_28 = 0
replace days1_28 = 1 if time==1
tab days1_28
}
else {
gen days0_28 = 0
replace days0_28 = 1 if time==0
tab days0_28
}
gen days28_197 = 0
replace days28_197 = 1 if time==28
tab days28_197
if `prevax_cohort'==1 {
gen days197_365 = 0
replace days197_365 = 1 if time==197
tab days197_365
gen days365_714 = 0
replace days365_714 = 1 if time==365
tab days365_714
}
* Run models and save output [Note: cannot use efron method with weights]
tab time outcome_status
di "Total follow-up in days: " fup_total
bysort time: summarize(fup), detail
if `sub_sex'==1 {
stcox days* age_spline1 age_spline2, strata(region) vce(r)
est store min, title(Age_Sex)
stcox days* age_spline1 age_spline2 i.cov_cat_* cov_num_* cov_bin_*, strata(region) vce(r)
est store max, title(Maximal)
}
else {
stcox days* i.cov_cat_sex age_spline1 age_spline2, strata(region) vce(r)
est store min, title(Age_Sex)
stcox days* age_spline1 age_spline2 i.cov_cat_* cov_num_* cov_bin_*, strata(region) vce(r)
est store max, title(Maximal)
}
estout * using "output/stata_model_output-`name'.txt", cells("b se t ci_l ci_u p") stats(risk N_fail N_sub N N_clust) replace
* Calculate median follow-up among individuals with the outcome
keep if outcome_status==1
keep patient_id days* fup
rename fup tte
gen term = ""
if `prevax_cohort'==1 {
if "`day0'"=="TRUE" {
replace term = "days_pre" if days0_1==0 & days1_28==0 & days28_197==0 & days197_365==0 & days365_714==0
replace term = "days0_1" if days0_1==1 & days1_28==0 & days28_197==0 & days197_365==0 & days365_714==0
replace term = "days1_28" if days0_1==0 & days1_28==1 & days28_197==0 & days197_365==0 & days365_714==0
replace term = "days28_197" if days0_1==0 & days1_28==0 & days28_197==1 & days197_365==0 & days365_714==0
replace term = "days197_365" if days0_1==0 & days1_28==0 & days28_197==0 & days197_365==1 & days365_714==0
replace term = "days365_714" if days0_1==0 & days1_28==0 & days28_197==0 & days197_365==0 & days365_714==1
}
else {
replace term = "days_pre" if days0_28==0 & days28_197==0 & days197_365==0 & days365_714==0
replace term = "days0_28" if days0_28==1 & days28_197==0 & days197_365==0 & days365_714==0
replace term = "days28_197" if days0_28==0 & days28_197==1 & days197_365==0 & days365_714==0
replace term = "days197_365" if days0_28==0 & days28_197==0 & days197_365==1 & days365_714==0
replace term = "days365_714" if days0_28==0 & days28_197==0 & days197_365==0 & days365_714==1
}
}
else {
if "`day0'"=="TRUE" {
replace term = "days_pre" if days0_1==0 & days1_28==0 & days28_197==0
replace term = "days0_1" if days0_1==1 & days1_28==0 & days28_197==0
replace term = "days1_28" if days0_1==0 & days1_28==1 & days28_197==0
replace term = "days28_197" if days0_1==0 & days1_28==0 & days28_197==1
}
else {
replace term = "days_pre" if days0_28==0 & days28_197==0
replace term = "days0_28" if days0_28==1 & days28_197==0
replace term = "days28_197" if days0_28==0 & days28_197==1
replace term = "days197_535" if days0_28==0 & days28_197==0
}
}
replace tte = tte + 28 if term == "days28_197"
replace tte = tte + 197 if term == "days197_365"
replace tte = tte + 365 if term == "days365_714"
replace tte = tte + 197 if term == "days197_535"
bysort term: egen median_tte = median(tte)
egen events = count(patient_id), by(term)
keep term median_tte events
duplicates drop
export delimited using "output/stata_fup-`name'.csv", replace