---
title: "Bayes-DL Covid-19"
author: "Stefano Cabras"
date: '`r format(Sys.time())` '
output: html_document
---
```{r setup, include=FALSE}
rm(list = ls())
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
# reticulate is the R-Python bridge used below to run the LSTM model (dl.py)
library(reticulate)
Sys.setenv(RETICULATE_MINICONDA_PATH = "~/miniforge3/")
Sys.setenv(RETICULATE_PYTHON = "~/miniforge3/envs/r-reticulate/bin/python")
use_python("~/miniforge3/envs/r-reticulate/bin/python")
use_condaenv("~/miniforge3/envs/r-reticulate/")
library(reshape2) # melt()/dcast() for long/wide reshaping
library(ggplot2)  # plots
library(DT)       # interactive tables
```
# Manuscript
The model implemented here is documented in:
Stefano Cabras, A Bayesian-Deep Learning Model for Estimating COVID-19 Evolution in Spain, *Mathematics*, **9**(22), 2921, 2021. (https://doi.org/10.3390/math9222921)
# Data preparation
Data are downloaded from https://cnecovid.isciii.es/covid19/#documentaci%C3%B3n-y-datos. They report the daily COVID-19 case counts for each region (CCAA) of Spain.
```{r,eval=TRUE}
# ISO codes of the 19 Spanish regions (CCAA), in alphabetical order
ccaa.lab = c("AN", "AR", "AS", "CB", "CE", "CL", "CM", "CN", "CT",
             "EX", "GA", "IB", "MC", "MD", "ML", "NC", "PV", "RI", "VC")
# Daily reported cases by region from the ISCIII surveillance data
cvirus.ccaa = read.table("https://cnecovid.isciii.es/covid19/resources/casos_diag_ccaadecl.csv",
                         sep = ",", header = TRUE)
# cvirus.ccaa = cvirus.ccaa[cvirus.ccaa$fecha <= "2021-10-18", ]
maxr = max(which(cvirus.ccaa$fecha == max(cvirus.ccaa$fecha))) # index of the last reported day
cvirus.ccaa$ccaa_iso = factor(cvirus.ccaa$ccaa_iso)
cvirus.ccaa = data.frame(region = cvirus.ccaa$ccaa_iso,
                         day = as.Date(as.character(cvirus.ccaa$fecha),
                                       format = "%Y-%m-%d"),
                         cases = cvirus.ccaa$num_casos)
# Regional populations, used to express incidence per 100,000 inhabitants
pop = read.table("demcca.csv", header = TRUE, sep = ";")
pop = data.frame(region = pop$CCAA, pop = pop$pob)[1:19, ]
pop = pop[order(pop$region), ]
pop$w = pop$pop / sum(pop$pop) # population weights, used later for the Spain-wide aggregation
```
The statistical unit is the day; the variables are the cumulative cases over the past 14 days, up to and including that day, in each of the 19 Spanish regions (CCAA).
```{r}
# Wide format: one row per day, one column per region
cvirus.ccaa.fat = dcast(cvirus.ccaa, formula = day ~ region, value.var = "cases")
for (j in 2:ncol(cvirus.ccaa.fat)) {
  y = cvirus.ccaa.fat[, j]
  # Pad with 14 zeros so the rolling sum is defined from the first day
  y = c(rep(0, 14), y)
  y = cumsum(y)
  # 14-day rolling sum: cumsum at day t minus cumsum at day t - 14
  y = y[15:length(y)] - y[1:(length(y) - 14)]
  # Convert to incidence per 100,000 inhabitants of the region
  cvirus.ccaa.fat[, j] = as.integer((y / pop$pop[pop$region == colnames(cvirus.ccaa.fat)[j]]) * 100000)
}
cvirus = cvirus.ccaa.fat
last.observed = max(cvirus$day)
dim(cvirus)
```
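As a quick sanity check of the rolling-sum trick (on a toy vector, not the real data), the cumulative-sum difference reproduces the direct 14-day window sum:

```{r rollcheck}
x = 1:20
cs = cumsum(c(rep(0, 14), x))
roll14 = cs[15:length(cs)] - cs[1:(length(cs) - 14)]
# Direct computation of the sum over the past 14 days, for comparison
direct = sapply(seq_along(x), function(d) sum(x[max(1, d - 13):d]))
stopifnot(all(roll14 == direct))
```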
We have `r nrow(cvirus)` days (from 2020-01-01 to `r last.observed`) and `r ncol(cvirus)` variables: the day plus the 14-day cumulative incidence per 100,000 inhabitants in each of the 19 regions.
These are the daily 14-day cumulative incidences:
```{r obsinc}
meltdat = melt(cvirus, id = "day", variable.name = "region",
               value.name = "Incidence")
# theme_bw() comes first so that it does not override the axis-text rotation
p = ggplot(meltdat, aes(x = day, y = Incidence, color = region)) +
  geom_line() + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8)))
p
```
# Model
We have to model all the observed sequences. Let $Y_{ts}$ be the incidence at time $t=1,\ldots,$`r nrow(cvirus)` in region $s=1,\ldots,19$. The objective is to estimate
$$P=\Pr(Y_{ts}\leq y\mid\mathcal{F}_{t-1}),$$
where $\mathcal{F}_{t-1}$ denotes the process filtration up to the previous day, over all possible sequences (that is, over all regions $s$).
If $P$ were known, it would allow us to predict:
1. the COVID-19 evolution conditionally on the past;
2. spillover effects across regions and features, answering questions such as: what happens if COVID-19 starts to spread, or disappears, in some of the Spanish regions?
3. which regions are the most connected; for instance, how important is it to control cases in the Canary Islands or in Madrid in terms of the induced variation in cases over the whole of Spain?
We attempt to estimate $P$ in two steps:
1. approximating the mean of $Y_{ts}$ with a Deep Learning (DL) approach for sequences, with particular attention to LSTM architectures;
2. estimating $P$ through an i.i.d. Poisson model for each $t$ and $s$, where the prior on the Poisson intensity takes its mean from the DL prediction and its variance from the error of the DL model (see the sketch after this list).
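Concretely, step 2 is a conjugate Gamma-Poisson update. Writing $m$ for the LSTM prediction and $v$ for the corresponding mean squared error (the elicited prior mean and variance), the Gamma prior with that mean and variance, and the resulting Negative Binomial posterior predictive after observing $y_{ts}$ (in R's `size`/`prob` parameterisation), are
$$
\lambda_{ts}\sim\text{Gamma}(a,b),\qquad a=\frac{m^2}{v},\quad b=\frac{m}{v},
$$
$$
Y_{ts}\mid\lambda_{ts}\sim\text{Poisson}(\lambda_{ts})
\;\Longrightarrow\;
Y_{ts}^{\text{new}}\mid y_{ts}\sim\text{NegBin}\left(a+y_{ts},\,\frac{b+1}{b+2}\right),
$$
while for days not yet observed the prior predictive $\text{NegBin}(a,\,b/(b+1))$ is used. This is the update implemented in `bayes.pred` below, with $v$ estimated separately for each region and prediction delay.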
## LSTM Estimation
LSTM models are a particular class of recurrent neural networks; see https://en.wikipedia.org/wiki/Long_short-term_memory.
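The network itself is defined in `dl.py` and run from R through reticulate, so its exact architecture is not shown in this document. Purely as an illustration of the kind of sequence model involved, a minimal sketch in R keras could look as follows; the layer sizes and the input window `timesteps` are hypothetical and do not reproduce `dl.py`:

```{r lstmsketch, eval=FALSE}
library(keras)
timesteps = 14  # length of the input window (hypothetical)
n.regions = 19  # one channel per region
model = keras_model_sequential() %>%
  layer_lstm(units = 32, input_shape = c(timesteps, n.regions)) %>%
  layer_dense(units = n.regions)  # next-day incidence for all regions
model %>% compile(optimizer = "adam", loss = "mse")
```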
### Model train and test
```{r trainlstm}
est.lstm = function(dat, test.from = max(cvirus$day) + 1) {
  # dlio/ is the exchange directory between R and the Python LSTM code
  unlink("dlio", recursive = TRUE)
  dir.create("dlio")
  ii = dat[, 1] < test.from # days strictly before test.from form the training set
  write.table(dat[ii, -1], file = "dlio/civinput.csv",
              sep = ",", row.names = FALSE)
  write.table(dat[, -1], file = "dlio/civinput-test.csv",
              sep = ",", row.names = FALSE)
  py_set_seed(17) # reproducibility on the Python side
  py_run_file("dl.py") # the LSTM model is defined and fitted here
}
```
All observations on or after `test.from` are used for testing and are excluded from training.
```{r collectlstmoutput}
# Gathers the LSTM output files written by dl.py: one file per prediction
# delay (days ahead). Returns the table used by the Bayesian step, the
# predictions and the prediction errors.
collect.lstmoutout = function(datobs, testset = FALSE) {
  allres = allerror = allres.bayes = NULL
  for (i in 1:max_delay) {
    if (testset) {
      preds <- read.csv(paste("dlio/civout-test", i, ".csv", sep = ""), sep = ",")[-1]
    } else {
      preds <- read.csv(paste("dlio/civout", i, ".csv", sep = ""), sep = ",")[-1]
    }
    preds[preds < 0] = 0 # incidence cannot be negative
    # Align predictions with the last nrow(preds) observed days, then shift:
    # file i contains predictions referring to i-1 days beyond the alignment day
    dd = (datobs$day)[(nrow(datobs) - nrow(preds) + 1):nrow(datobs)]
    preds$day = dd + i - 1
    preds$delay = i
    preds$type = "lstm.pred"
    mm = match(preds$day, datobs$day)
    error = datobs[mm, -1] - preds[, 1:19] # observed minus predicted, per region
    error$delay = i
    error$day = datobs$day[mm]
    allres.bayes = rbind(allres.bayes, cbind(preds, datobs[mm, ]))
    allres = rbind(allres, preds)
    allerror = rbind(allerror, error)
  }
  allres = rbind(allres, data.frame(datobs, type = "observed", delay = 0))
  allres = melt(allres, id.vars = c("day", "delay", "type"),
                variable.name = "region", value.name = "cases")
  allerror = melt(allerror, id.vars = c("day", "delay"),
                  variable.name = "region", value.name = "error")
  return(list(allres.bayes, allres, allerror))
}
```
Here we fit the model twice: first to evaluate its performance on a test set, and then on all data for the final analysis.
```{r lstmtraintest}
# The number of delay files from a previous run sets the test horizon
max_delay = length(list.files(path = "dlio", pattern = "civout-test"))
est.lstm(cvirus, test.from = last.observed - max_delay) # model evaluation on the test set
max_delay = 60 # dl.py produces predictions up to 60 days ahead
tt = collect.lstmoutout(cvirus, testset = TRUE)
allres.test = tt[[2]]
allerror.test = tt[[3]]
allres.bayes.test = tt[[1]]
est.lstm(cvirus) # now use all data for the rest of the analysis
tt = collect.lstmoutout(cvirus)
allres = tt[[2]]
allerror = tt[[3]]
allres.bayes = tt[[1]]
```
The model is learning from the data: the validation error decreases over the training epochs.
```{r modtrainres}
history = read.csv("dlio/history.csv")
# Rescale the loss from the standardized scale used in dl.py to incidence units
history = history * sd(unlist(cvirus[, -1]))
ggplot(history, aes(x = 1:nrow(history), y = val_loss)) +
  geom_point() + geom_smooth() + xlab("Epoch") +
  ylab("Mean Squared Error (Incidence)") + theme_bw()
```
These are the errors as a function of how many days ahead the prediction was made (delay) and of the region (some regions are easier to predict than others). These LSTM errors provide the prior variances:
```{r lstmerror}
ggplot(allerror,aes(x=factor(delay),y=error))+
facet_wrap(. ~ region)+geom_violin()+ ylab("Incidence")+ theme_bw()
```
These are the LSTM predictions (prior means):
```{r lstmpred}
ggplot(subset(allres, (delay %in% c(0, 1))),
       aes(x = day, y = cases, color = type)) + facet_wrap(. ~ region) + geom_line() + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = rel(1)))
# theme_bw() before theme() so that the axis-text rotation is not overridden
ggplot(subset(allres, day > as.Date("2021-01-01")), aes(x = day, y = cases, color = type)) +
  facet_wrap(. ~ region) + geom_line() + ylab("Incidence") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8)))
```
This output provides a guess of the process filtration $\mathcal{F}_{t-1}$ appearing in $P$.
# Bayesian model
To account for prediction uncertainty, we treat the LSTM predictions and the corresponding errors as an expert who elicits the mean of the Poisson intensity (the LSTM prediction) along with its variance (the LSTM mean squared error). The variances are computed conditionally on the region and on the number of days ahead of the prediction (delay).
The posterior predictive of the resulting Negative Binomial Bayesian model is computed here. In particular, the model reports the posterior median along with 90% credible intervals.
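As a toy numerical illustration of this update (the numbers are made up): suppose the LSTM predicts an incidence of $m=300$ with mean squared error $v=2500$, and the observed incidence for that day is $y=310$.

```{r toynegbin}
m = 300; v = 2500; y = 310  # illustrative values only
a = m^2 / v  # prior Gamma shape
b = m / v    # prior Gamma rate
# Posterior predictive 90% interval bounds and median
qnbinom(c(0.05, 0.5, 0.95), size = a + y, prob = (b + 1) / (b + 2))
```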
```{r negbin}
bayes.pred = function(allres.bayes, no.obs = FALSE) {
  # Columns 1:21 hold the LSTM predictions (19 regions) plus day and delay;
  # columns c(20, 24:42) pair the day with the matching observed values.
  preds = melt(data.frame(allres.bayes[, 1:21]),
               id.vars = c("day", "delay"), value.name = "cases",
               variable.name = "region")
  obs = melt(data.frame(allres.bayes[, c(20, 24:42)]), id.vars = "day",
             value.name = "cases", variable.name = "region")
  # LSTM mean squared error by delay and region: the prior variance
  lstm.var = data.frame(error = (preds$cases - obs$cases)^2, delay = preds$delay, region = preds$region)
  lstm.var = aggregate(lstm.var$error, by = list(delay = lstm.var$delay, region = lstm.var$region), mean, na.rm = TRUE)
  prior.var = apply(preds[, 2:3], 1, function(xx)
    lstm.var$x[(lstm.var$delay == as.numeric(xx[1])) & (lstm.var$region == as.character(xx[2]))])
  ii = is.na(obs$cases) | no.obs # days without an observation use the prior predictive
  a = preds$cases^2 / prior.var  # prior Gamma shape parameter
  b = preds$cases / prior.var    # prior Gamma rate parameter
  p = (b + 1) / (b + 2)          # posterior predictive prob parameter
  p[ii] = b[ii] / (b[ii] + 1)    # prior predictive prob when no observation
  oo = obs$cases
  oo[ii] = 0
  size = a + oo                  # posterior predictive size parameter
  post.pred.summary = t(apply(cbind(size, p), 1,
                              function(x) qnbinom(c(0.05, 0.5, 0.95), size = x[1], prob = x[2])))
  colnames(post.pred.summary) = c("Inf90", "median", "Sup90")
  final.res = data.frame(day = preds$day, delay = preds$delay,
                         region = preds$region,
                         lstm = preds$cases, observed = obs$cases, post.pred.summary)
  # Column 23 holds the observed day: for days beyond the last observed one,
  # keep only the smallest-delay (best available) prediction
  diff.pred = preds$day - max(allres.bayes[, 23], na.rm = TRUE)
  final.res = final.res[(diff.pred <= 0) | ((diff.pred > 0) & (preds$delay - diff.pred <= 1)), ]
  return(final.res)
}
final.res = bayes.pred(allres.bayes)
```
# Results on available data
These are the predictions of the incidence given the observed data: the posterior predictive for already observed days, and the prior predictive (according to the LSTM "expert" elicitation) for days still to be observed.
## Incidence by region
For each region, here is the observed incidence along with the posterior predictive summaries: the median (red) and the 90% credible intervals (grey):
```{r regpredinc}
toplot = subset(final.res, day > as.Date("2021-01-01"))
toplot = toplot[toplot$delay >= 1, ]
ggplot(toplot, aes(x = day, y = observed)) + geom_line() +
  geom_line(aes(x = day, y = median), colour = "red") +
  geom_ribbon(aes(x = day, ymin = Inf90, ymax = Sup90), alpha = 0.5) +
  facet_wrap(. ~ region) +
  ylab("Incidence") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8)))
datatable(toplot[is.na(toplot$observed), ], rownames = NULL)
```
## Incidence for Spain
The same as above, but for the whole of Spain, with regions weighted according to their population.
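In formula, with the weights $w_s$ computed in the data-preparation step,
$$
Y_t^{\text{Spain}}=\sum_{s=1}^{19} w_s\,Y_{ts},\qquad w_s=\frac{\text{pop}_s}{\sum_{r=1}^{19}\text{pop}_r}.
$$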
```{r}
# Population-weighted aggregation of the regional summaries to all of Spain
# (pop is ordered by region, matching the factor levels of final.res$region)
final.res.spain = round(final.res[c("observed", "Inf90", "median", "Sup90")] * pop$w[final.res$region], 0)
final.res.spain = aggregate(final.res.spain, by = list(day = final.res$day, delay = final.res$delay), sum)
toplot = subset(final.res.spain, day > as.Date("2021-11-01"))
toplot = toplot[toplot$delay >= 1, ]
pmax = ggplot(toplot, aes(x = day, y = observed)) + geom_line() +
  geom_line(aes(x = day, y = median), colour = "red") +
  geom_ribbon(aes(x = day, ymin = Inf90, ymax = Sup90), alpha = 0.5) +
  ylab("Incidence") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = rel(0.8)))
pmax
datatable(toplot[is.na(toplot$observed), ], rownames = NULL)
```