-
Notifications
You must be signed in to change notification settings - Fork 1
/
DataPrepDisbayes.Rmd
555 lines (454 loc) · 27.3 KB
/
DataPrepDisbayes.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
---
title: "Preparing data from the Global Burden of Disease for METAHIT modelling for English City Regions"
author:
- "Jackson, Chris <chris.jackson@mrc-bsu.cam.ac.uk>"
date: "`r Sys.time()`"
output:
html_document:
code_folding: show
params:
date: !r Sys.Date() - 1
---
# Read in and combine data
First load required packages and specify local file paths
```{r}
library(tidyverse)
library(progress)
library(disbayes)
library(tempdisagg)
# relative_path_gbd <- "~/scratch/chronic"
# relative_path_execute <- "~/scratch/chronic/mh-execute"
relative_path_execute <- paste0("~/mh-execute")
relative_path_mslt <- paste0("~/mh-mslt")
```
# Prepare data for city regions
Read look-up table mapping between local authorities and city regions, and add names for regions that are not city regions
```{r,eval=FALSE}
regions <- c("East Midlands", "East of England", "Greater London", "North East England",
'North West England', "South East England", "South West England", "Yorkshire and the Humber", "West Midlands")
# LTLAs to city regions and regions
local_government_areas <- read_csv(file.path(relative_path_execute, "inputs/mh_regions_lad_lookup.csv")) %>% rename(region="gordet")
lad <- read.csv(file.path(relative_path_mslt,"input/Local_Authority_District_to_Region_(December_2017)_Lookup_in_England.csv"))
utla <- read.csv(file.path(relative_path_mslt,"input/Lower_Tier_Local_Authority_to_Upper_Tier_Local_Authority_(December_2017)_Lookup_in_England_and_Wales.csv"))
## Make UTLA to region lookup
lad_wales <- c("Isle of Anglesey", "Gwynedd", "Conwy", "Denbighshire", "Flintshire",
"Wrexham", "Ceredigion", "Pembrokeshire", "Carmarthenshire",
"Swansea", "Neath Port Talbot", "Bridgend", "Vale of Glamorgan",
"Cardiff", "Rhondda Cynon Taf", "Caerphilly", "Blaenau Gwent",
"Torfaen", "Monmouthshire", "Newport", "Powys", "Merthyr Tydfil")
utlareg <- utla %>%
filter(!duplicated(UTLA17NM)) %>%
select(LTLA17NM, UTLA17NM) %>%
rename("LAD17NM"=LTLA17NM) %>%
left_join(lad, by="LAD17NM") %>%
filter(!(LAD17NM %in% lad_wales))
```
Read Global Burden of Disease data
UPDATE: BZ added latest diseases 11-10-21 and removed hard coded files
```{r,eval=FALSE}
list_of_files <- list.files(path = paste0(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/"),
pattern = "\\.csv$",
full.names = TRUE)
gbd <- do.call(rbind, lapply(list_of_files, read.csv))
## Attach geographical info to each GBD estimate
## Manual step - this should cover them all
gbd$location_name[gbd$location_name=="St Helens"] <- "St. Helens"
gbd <- gbd %>% mutate(
areatype = case_when(location_name %in% c("United Kingdom","Australia") ~ "country",
location_name %in% c("England","Scotland","Wales","Northern Ireland") ~ "nation",
location_name %in% regions ~ "region",
location_name %in% unique(utla$LTLA17NM) ~ "ltla",
location_name %in% unique(utla$UTLA17NM) ~ "utla"
)
)
stopifnot(all(!is.na(gbd$areatype)))
## Determine which estimates are in city regions
gbd <- gbd %>%
left_join(local_government_areas, by=c("location_name"="lad11nm")) %>%
mutate(in_cityregion = !is.na(cityregion)) %>%
filter(areatype %in% c("ltla","utla"))
names(gbd) <- gsub(pattern = "_name", replacement = "", x = names(gbd))
gbd <- gbd %>%
select(-contains("id")) %>%
mutate(cause = tolower(cause))
#
# population <- gbd %>% filter(cause == "all causes" & measure == "Deaths") %>%
# select(location, sex, age, val, metric) %>%
# pivot_wider(names_from="metric", values_from="val") %>%
# mutate(pop = Number*100000/Rate)
#
# gbd_temp <- left_join(gbd, population)
save(gbd, local_government_areas, DISEASE_SHORT_NAMES, file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/GBD2019.rda"))
## Next we want to decide whether to use LTLA/UTLA or region for areas not in city regions
```
Read disease coding table
UPDATE: BZ added latest diseases to disease_names_execute 11-10-21
```{r,eval=FALSE}
disease_names_execute <- read_csv(file.path(relative_path_execute, "inputs/dose_response/disease_outcomes_lookup_new.csv")) %>% select(GBD_name, acronym) %>%
mutate(disease = tolower(GBD_name))
DISEASE_SHORT_NAMES <- disease_names_execute %>%
mutate(sname = abbreviate(disease)) %>%
mutate(is_not_dis = ifelse((grepl("injuries", disease) | # grepl determines if the disease string contains 'injuries'
grepl("all causes", disease) |
grepl("lower respiratory infections", disease)),
1, 0)) %>%
mutate(is_not_dis = case_when(sname == "allc" ~ 2,
sname == "lwri" ~ 1,
sname == "npls" ~ 2,
sname == "crdd" ~ 2,
TRUE ~ is_not_dis)) %>%
mutate(
males = ifelse(disease %in% c("uterine cancer", "breast cancer"), 0, 1),
females = ifelse(disease %in% "prostate cancer", 0, 1),
sname = gsub("'", '', sname),
acronym = ifelse(is.na(acronym), sapply(strsplit(disease, " "), head, 1), acronym))
saveRDS(DISEASE_SHORT_NAMES, paste0(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/DISEASE_SHORT_NAMES.rds"))
```
# Determining "effective sample sizes" behind estimates
<!-- NOTE COUNTS FOR AREAS CONTAINING CITY REGIONS ARE NOT CONSTRAINED TO BE GREATER THAN TOTAL COUNTS FOR CITY REGIONS WITHIN THAT AREA. we will exclude non-city region data from this analysis. see notes in noncityregions.Rmd -->
This part is the most computationally intensive. For each estimated "rate" published by the GBD (actually a proportion), the associated credible interval is converted to an "effective sample size" that describes the amount of information that the estimate is based on.
For example, for prevalence estimates, these are assumed to be based on a survey of n people in the area, and observing r people with the disease. The point estimate is assumed to equal r/n, and the credible interval is interpreted as the posterior interval we would get from a Bayesian analysis of these data with a vague (Beta(0,0)) prior. The implied values of r and n can then be computed based on the point estimate and credible interval.
n is referred to as the "effective sample size". Note that this is generally not equal to the actual population of the area. They would only be equal if a survey of the full population had been conducted. We do not have access to the underlying data or methods that GBD used to calculate the credible interval, so we do not know the true value of n.
Before determining the effective sample sizes, we filter the data to include only the "Rate" estimates for specific diseases (excluding "all causes" results). Note the published "Rates" are actually the expected number of events (e.g. prevalent cases at the start of a year, incident events within a year or deaths within a year) among a population of 100000 that includes all people in the demographic subgroup of interest, not just those at risk of the event. The "Rates" are divided by 100000 so they can be interpreted as proportions.
Lower limits published as 0 and upper limits published as 1 are modified to be close but not equal to these bounds, while remaining consistent with the point estimate, since values of exactly 0 and 1 are inconsistent with the Beta distribution (TODO put this in disbayes::ci2num).
```{r,eval=FALSE}
gbdp <-
gbd_temp %>%
filter(metric == "Rate") %>%
filter(cause != "all causes") %>%
mutate(num = NA, denom = NA) %>%
mutate(val = val/100000, lower = lower / 100000, upper = upper / 100000) %>%
mutate(lower = if_else(lower==0, pmin(val/2, 0.00001), lower)) %>%
mutate(upper = if_else(upper>=1, pmax((1+val)/2, 0.99999), upper))
```
The function `ci2num` in the `disbayes` package is then used to calculate the effective sample sizes. This takes about 40 minutes on a fairly fast laptop. It could be made a lot faster by using parallel processing.
```{r,eval=FALSE,cache=TRUE}
nest <- nrow(gbdp)
pb <- progress_bar$new(total = nest) # show progress of computation
for (i in 1:nest){
if (gbdp$val[i] < gbdp$upper[i] &
gbdp$val[i] > gbdp$lower[i]) {
pb$tick()
counts <- disbayes::ci2num(gbdp$val[i], gbdp$lower[i], gbdp$upper[i])
gbdp$num[i] <- counts$num
gbdp$denom[i] <- counts$denom
}
}
saveRDS(gbdp, file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdp.rds"))
```
```{r,echo=FALSE}
gbdp <- readRDS(file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdp.rds"))
```
The estimates are very close to the implicit numerator divided by the implicit denominator in all cases - so those implicit counts can be used in place of the point estimates.
```{r}
summary(gbdp$num/gbdp$denom - gbdp$val)
```
The remaining counts still to be filled in are those where the point estimate is exactly 0 or 1, which is incompatible with a beta distribution.
There are also many estimates for which the implicit denominator is implausibly large. These correspond to disease events which are very rare in particular subgroups, thus the estimates of the effective sample size are unstable.
```{r}
gbdp %>% select(val, num, denom) %>% arrange(desc(denom)) %>% head
```
If the point estimate is 0 or 1, or if the denominator obtained from `ci2num` is larger than the actual population size, we will simply use the actual population size of the subgroup as the denominator, which will be closer to the amount of information contributing the estimate.
# Determining actual population sizes
Therefore we reconstruct the actual population sizes of each subgroup, assumed to be equal to the estimates published by GBD as the estimated "Number" of cases, divided by the estimated "rates" per person.
Update BZ: no need for this step as population alrady calculated earlier, move to next step.
```{r,eval=FALSE}
gbdnum <- gbd %>%
filter(metric=="Number") %>%
select(measure, location, sex, age, cause, Number=val)
gbdp <- gbdp %>%
select(!Number) %>%
left_join(gbdnum, by=c("measure","location","sex","age","cause")) %>%
mutate(pop = Number / val)
saveRDS(gbdp, file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdp.rds"))
gbdp <- readRDS(file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdp.rds"))
```
We can then use these to fill in the effective sample sizes "d" that were missing or implausible, and deduce the effective numerators "n" by multiplying "d" by the point estimate of the proportion.
```{r}
gbdp <- gbdp %>%
mutate(pop = if_else(is.na(pop), 5000, pop)) %>%
mutate(nodenom = is.na(denom) | (denom > pop),
denom = if_else(is.na(denom), pop, denom),
denom = if_else(denom > pop, pop, denom),
num = ifelse(nodenom, round(val*denom), num))
```
Note that the data are still grouped as originally published - estimates by five-year age groups (not one year) and local authorities (not city regions), as well as by gender and disease measure.
Now we have reconstructed the implicit count "data" on which these estimates are based, these counts can now be easily aggregated or disaggregated to produce estimates for smaller or larger subgroups. The counts will retain their meaning as the implicit number of events, or number of individuals, observed in the subgroup.
# Disaggregating by age groups
Firstly we can disaggregate the five year age groups to single years of age. If we assume that there was an equal amount of information from each single year contributing to the five-year result, we can simply divide the numerators r and denominators n for the five-year estimates by 5 (and round to the nearest integer).
However a smarter method involves a temporal disaggregation model, using the `tempdisagg` package (Sax and Steiner, 2020). For each measure, location, sex, and cause, the five year counts are disaggregated to one-year counts in a way that preserves the five year totals, but the resulting one-year counts vary smoothly.
```{r}
# Working with the data that has one row per five-year age group,
# first construct 1-year counts as an extra column.
gbdp <- gbdp %>%
extract(age, c("from_age", "to_age"), "(.+) to (.+)", remove=FALSE, convert=TRUE) %>%
mutate(from_age = case_when(age=="95 plus" ~ 95L,
age=="Under 5" ~ 0L,
TRUE ~ from_age),
to_age = case_when(age=="95 plus" ~ 99L,
age=="Under 5" ~ 4L,
TRUE ~ to_age),
agediff = to_age - from_age + 1, # this will equal 5
num1yr = round(num/agediff),
denom1yr = round(denom/agediff)) %>%
rename(agegroup = age)
## Now stretch the data out using an index, to create a data frame with 1 row per year of age and create a variable for year of age.
index <- rep(1:nrow(gbdp), gbdp$agediff)
gbdpyrd5 <- gbdp[index,] %>%
mutate(ageyr = from_age + sequence(gbdp$agediff) - 1)
gbdpyrd5 <- gbdpyrd5 %>%
select(measure, location, ageyr, sex, agegroup, from_age, to_age, cause, year, areatype, cityregion, region, in_cityregion, num1yr, denom1yr, pop)
saveRDS(gbdpyrd5, file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdpyrd5.rds"))
## More advanced method for smooth disaggregation
## using the tempdisagg package
# tmp <- gbdp %>% group_by(measure, location, sex, cause) %>% filter(cur_group_id()==1)
gbdp_grp <- gbdp %>%
group_by(measure, location, sex, cause,
areatype, cityregion, region, in_cityregion) %>% # filter(cur_group_id() %in% 1:3) %>%
arrange(measure, location, sex, cause, from_age)
disagg <- function(dat, key){
res <- with(dat, {
data.frame(
ageyr = rep(from_age,agediff) + sequence(agediff) - 1,
num = predict(td(num ~ 1, to=5, method="fast")),
denom = predict(td(denom ~ 1, to=5, method="fast"))
) } )
res
}
# This takes about a minute
gbdpyr <- group_modify(gbdp_grp, disagg) %>%
ungroup %>%
left_join(gbdpyrd5, select("measure","location","ageyr","sex","cause",
"areatype", "cityregion", "region", "in_cityregion"), by=c("measure","location","ageyr","sex","cause",
"areatype", "cityregion", "region", "in_cityregion"))
# Sometimes the results are negative.
# Revert to dividing by 5 for all agegroups that contain a negative result
neg_num <- gbdpyr %>%
filter(num<0) %>%
select("measure","location","sex","cause","agegroup") %>%
distinct() %>%
mutate(zeronum = TRUE)
neg_denom <- gbdpyr %>%
filter(denom<0) %>%
select("measure","location","sex","cause","agegroup") %>%
distinct() %>%
mutate(zerodenom = TRUE)
gbdpyr <- gbdpyr %>%
left_join(neg_num, by=c("measure", "location","sex","cause","agegroup")) %>%
left_join(neg_denom, by=c("measure", "location","sex","cause","agegroup")) %>%
mutate(zeronum = replace_na(zeronum, FALSE)) %>%
mutate(zerodenom = replace_na(zerodenom, FALSE)) %>%
mutate(num = if_else(zeronum, num1yr, round(num)),
denom = if_else(zerodenom, denom1yr, round(denom)))
saveRDS(gbdpyr, file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdpyr.rds"))
## Check for a few cases that the series match with the naive dividing by 5 method and the smooth methods
tloc <- sample(unique(gbdpyrd5$location), 3)
test2 <- gbdpyrd5 %>%
filter(cause=="ischemic heart disease", sex=="Male", measure=="Deaths",
location %in% tloc)
gbdpyr %>%
filter(cause=="ischemic heart disease", sex=="Male", measure=="Deaths",
location %in% tloc) %>%
ggplot(aes(x=ageyr, y=denom, col=location, group=location)) +
geom_line() + geom_point() +
geom_line(data=test2, aes(y=denom1yr))
```
#### Example
Data by 5-year age groups for one location/cause/measure of interest
```{r}
gbdp %>%
filter(measure=="Incidence", location=="bristol", cause=="ischemic heart disease",
agegroup %in% c("40 to 44", "45 to 49"), sex=="Female") %>%
select(agegroup, sex, num, denom, num1yr, denom1yr)
```
Equivalent data by single year of age. Note due to rounding errors, the one-year counts will not always add up exactly to the five-year counts.
```{r}
gbdpyr %>%
filter(measure=="Incidence", location=="England", cause=="ischemic heart disease",
ageyr %in% 40:49, sex=="Female") %>%
select(ageyr, sex, num, denom)
```
# Aggregating by area
Secondly we can aggregate the data from local authorities to produce data by city regions (defined as groups of local authorities), and regions excluding city regions. This covers the whole of England with mutually exclusive areas. No extra assumptions are required to do this. It doesn't matter whether this is done before or after converting from 5-year to 1-year age groups.
Update: BZ added nations to then use English data for small diseases
```{r}
options(dplyr.summarise.inform = FALSE)
## City regions
gbdpyrcr <- gbdpyr %>%
filter(in_cityregion) %>%
group_by(measure, sex, ageyr, cause, cityregion) %>%
summarise(num = sum(num), denom = sum(denom), pop = sum(pop)) %>%
mutate(areatype = "cityregion") %>%
rename(location = cityregion) %>%
ungroup %>%
select(measure, sex, ageyr, cause, location, areatype, num, denom, pop)
## UTLAs with estimates in GBD
gbdpyru <- gbdpyr %>%
filter(!in_cityregion, areatype=="utla") %>%
select(measure, sex, ageyr, cause, location, areatype, num, denom, pop)
## LTLAs from GBD aggregated to UTLAs
gbdpyrlagg <- gbdpyr %>%
filter(!in_cityregion, areatype=="ltla") %>%
left_join(utla, by=c("location"="LTLA17NM")) %>%
group_by(measure, sex, ageyr, cause, UTLA17NM) %>%
summarise(num=sum(num), denom=sum(denom), pop=sum(pop)) %>%
rename(location = "UTLA17NM") %>%
mutate(areatype = "utla_agg") %>%
ungroup %>%
select(measure, sex, ageyr, cause, location, areatype, num, denom, pop)
## don't use this aggregation for now, but save for later if needed
gbdmodu <- rbind(gbdpyrcr, gbdpyru, gbdpyrlagg)
# LTLAs and UTLAs from GBD aggregated to regions, excluding areas that are in city regions
gbdpyrreg <- gbdpyr %>%
mutate(region = case_when(areatype=="ltla" ~ lad$RGN17NM[match(location, lad$LAD17NM)],
areatype=="utla" ~ utlareg$RGN17NM[match(location, utlareg$UTLA17NM)],
TRUE ~ region
)) %>%
filter(!in_cityregion) %>%
group_by(measure, sex, ageyr, cause, region) %>%
summarise(num=sum(num), denom=sum(denom), pop=sum(pop)) %>%
rename(location = "region") %>%
mutate(areatype = "region") %>%
ungroup %>%
select(measure, sex, ageyr, cause, location, areatype, num, denom, pop)
gbdmod <- rbind(gbdpyrcr, gbdpyrreg)
saveRDS(gbdmod,file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdmod.rds"))
gbdmod <- readRDS(file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdmod.rds"))
```
#### Example
Data by local authority for one cause/measure/demography of interest
```{r}
gbdpyr %>%
filter(measure=="Incidence", cityregion=="bristol", ageyr==44, cause=="ischemic heart disease") %>%
arrange(sex) %>%
select(location, areatype, sex, num, denom)
```
Equivalent data by city region: the numbers and denominators for each local authority are just added up
```{r}
gbdpyrcr %>%
filter(measure=="Incidence", location=="bristol", ageyr==44, cause=="ischemic heart disease") %>%
select(location, sex, num, denom)
```
### Spreading so one row by area x subgroup
Put numerators and denominators for incidence, prevalence and mortality in different columns
Rename variables for consistency with disbayes
```{r}
gbdmod <- readRDS(file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbdmod.rds"))
gbddb <- gbdmod %>%
select(-pop) %>%
filter(measure %in% c("Deaths","Incidence", "Prevalence")) %>%
mutate(measure = fct_recode(measure, mort="Deaths", inc="Incidence", prev="Prevalence")) %>%
pivot_wider(names_from=measure,
values_from=c("num","denom"),
values_fill = list(num=0, denom=5000)) %>%
mutate(area = fct_recode(str_to_title(location),
`Greater Manchester`="Greatermanchester",
`North East (city region)`="Northeast",
`North East (region)`="North East",
`West Midlands (city region)`="Westmidlands",
`West Midlands (region)`="West Midlands",
`East of England`="East Of England")) %>%
mutate(disease = str_to_sentence(cause)) %>%
rename(age = ageyr,
gender = sex,
inc_num = num_inc, inc_denom = denom_inc,
prev_num = num_prev, prev_denom = denom_prev,
mort_num = num_mort, mort_denom = denom_mort
) %>%
select(age, gender, disease, area, areatype, inc_num, inc_denom, prev_num, prev_denom, mort_num, mort_denom) %>%
arrange(area, disease, gender, age) %>%
filter(! disease %in% c("Hypertensive heart disease"))
saveRDS(gbddb, file=file.path(relative_path_mslt, "/input/gbd/GBD2019/METAHIT/gbddb.rds"))
gbdmod %>%
filter(ageyr==60, sex=="Male", cause=="ischemic heart disease", measure=="Incidence") %>%
filter(areatype %in% c("cityregion", "nation")) %>%
select(location, num, denom)
# Some age groups are not available. Assume count zero, denom 5000 for these
# Note again that counts for the wider regions / countries are not constrained to be consistent with the smaller areas
# but doesn't affect our analysis
```
# Cancer remission rates
10-year survival to use to inform remission rates, from the [ONS](https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/conditionsanddiseases/bulletins/cancersurvivalinengland/nationalestimatesforpatientsfollowedupto2017). These are not published by age. They are aggregated over ages 15-99, whereas 1, 5 year survival is published by age group.
Methods [here](https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/conditionsanddiseases/methodologies/cancersurvivalstatisticalbulletinsqmi). Assumes "conditional probabilities of surviving for patients diagnosed in current year are equal to those diagnosed over the full 10-year period of available data." Caveats about trends through time.
Our approach allows uncertainty around this - could represent expected variations between age groups, given variations for 5 year survival.
Annual remission probability is computed from the 10 year survival probability. Assumes that remission happened in one of the 10 years, with some unknown annual probability r such that the probability of remission in 10 years is equal to the 10 year survival probability s = 1 - (1-r)^10, so that r = 1 -(1-s)^(1/10).
Convert remission "rates" to numerators and denominators by assuming a denominator defined by the number of cancer cases over all ages and areas, but dividing all numerators and denominators by 100 to represent uncertainty about the true area-specific prevalence. For example, this gives a 95% credible interval for the annual breast cancer remission probability of 0.11 to 0.16.
```{r}
library(readxl)
rem <- read_excel(path=paste0(relative_path_mslt,"/input/Figure_10__Predicted_10-year_net-survival_using_the_hybrid_approach_for_men_and_women_(aged_15_to_99_years).xls"), range="A7:C26", na=":") %>%
pivot_longer(cols=c("Men","Women"), names_to = "sex", values_to = "surv10") %>%
mutate(prem1 = 1 - (1 - as.numeric(surv10)/100)^(1/10)) %>%
filter(Cancer %in% c("Breast","Uterus","Colorectal", "Kidney", "Bladder", "Liver", "Leukaemia", "Myeloma", "Stomach","Lung"),
!is.na(prem1)) %>%
mutate(Cancer = fct_recode(Cancer,
"Breast cancer"="Breast",
"Uterine cancer"="Uterus",
"Colon and rectum cancer"="Colorectal",
"Kidney cancer"="Kidney",
"Bladder cancer"="Bladder",
# "Liver cancer"="Liver",
"Chronic myeloid leukemia"="Leukaemia",
"Multiple myeloma"="Myeloma",
"Stomach cancer"="Stomach",
"Tracheal, bronchus, and lung cancer"="Lung"
),
gender = fct_recode(sex, "Male"="Men", "Female"="Women")) %>%
rename(disease = Cancer)
cancers <- c("Breast cancer", "Uterine cancer", "Colon and rectum cancer",
"Kidney cancer", "Bladder cancer", "Chronic myeloid leukemia", "Multiple myeloma",
"Stomach cancer", "Tracheal, bronchus, and lung cancer")
num_cancer <- gbddb %>%
filter(disease %in% cancers,
age %in% 15:99) %>%
group_by(gender, disease) %>%
summarise(ncases = sum(prev_num)) %>%
left_join(rem, by=c("disease","gender")) %>%
mutate(rem_num = round(ncases*prem1 / 100),
rem_denom = round(ncases / 100)) %>%
select(gender, disease, rem_num, rem_denom)
num_cancer
gbddb <- gbddb %>%
left_join(num_cancer) %>%
mutate(rem_num = replace_na(rem_num, 0),
rem_denom = replace_na(rem_denom, 0))
saveRDS(gbddb,file=file.path(relative_path_mslt, "/input/city regions/Input disbayes/disbayes_input_2_11_21.rds"))
```
Replace diseases with small numbers with data for England.
```{r}
gbddb <- readRDS(file=file.path(relative_path_mslt, "/input/city regions/Input disbayes/disbayes_input_2_11_21.rds"))
small_numbers <- gbddb %>% group_by(gender, disease, area) %>%
summarise(inc_total=sum(inc_num),
prev_total=sum(prev_num),
mort_total=sum(mort_num)) %>%
mutate(mort_exclude=ifelse(mort_total<=100, T, F),
inc_exclude=ifelse(inc_total<=100,T,F)) %>%
select(gender, disease, area, mort_exclude, inc_exclude)
small_numbers_true <- filter(small_numbers, mort_exclude==T, inc_exclude==T)
```
Generate data for England and use it to replace data in city regions with small numbers.
Exclude diseases with small numbers from city regions (we use England as whole data) and injuries (do not need disbayes processing).
STILL TO DO-ADD TRENDS
```{r}
library(data.table)
input_disbayes <- gbddb %>% filter(!str_detect(disease, "injuries")) %>%
filter(!disease %in% "Lower respiratory infections") %>%
left_join(small_numbers)
input_england <- readRDS(file=file.path(relative_path_mslt, "/input/england/Input disbayes/disbayes_input_7_11_21.rds")) %>%
mutate(mort_exclude=TRUE,
inc_exclude=TRUE)
setDT(input_disbayes)
setDT(input_england)
## Could be done more elegantly rather than repeating the same 4 times.
input_disbayes <- input_disbayes[input_england, on = c("age", "gender", "disease", "mort_exclude", "inc_exclude"), inc_num := i.inc_num]
input_disbayes <- input_disbayes[input_england, on = c("age", "gender", "disease", "mort_exclude"),
prev_num := i.prev_num]
input_disbayes <- input_disbayes[input_england, on = c("age", "gender", "disease", "mort_exclude", "inc_exclude"),mort_num := i.mort_num]
input_disbayes <- input_disbayes[input_england, on = c("age", "gender", "disease", "mort_exclude", "inc_exclude"),rem_num := i.rem_num]
## Check that for areas with small numbers, values were replaced with England data
test <- filter(input_england, gender=="Female", disease=="Liver cancer")
test2 <- filter(input_disbayes, area=="Bristol", gender=="Female", disease=="Liver cancer")
sum(test$inc_num-test2$inc_num)
sum(test$prev_num-test2$prev_num)
sum(test$mort_num-test2$mort_num)
saveRDS(input_disbayes,file=file.path(relative_path_mslt, "/input/city regions/Input disbayes/disbayes_input_8_11_21.rds"))
```