-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcovid-hera-hdx-explore.Rmd
386 lines (273 loc) · 13 KB
/
covid-hera-hdx-explore.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
---
title: "covid-hera-hdx-explore"
output: html_document
---
# code now moved to https://github.com/afrimapr/africovid
# HERA on HDX have various subnational covid data for ~16 African countries
# starting to explore them
# can potentially use as case study for data joining
https://data.humdata.org/organization/hera-humanitarian-emergency-response-africa?
searched for this on github to see if anyone has solved these issues already, but seemingly not
subnational_covid19_hera
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, message=FALSE, warning=FALSE)
library(dplyr)
library(ggplot2)
library(readr)
library(sf)
library(tmap)
library(lubridate)
library(afriadmin)
#using new path specification avoiding double slashes
folder <- r"(C:\Dropbox\_afrimapr\data\covid-hera-hdx\)"
```
```{r, hera-tgo, eval=TRUE, warning=FALSE}
dftgo <- readr::read_csv2(r"(C:\Dropbox\_afrimapr\data\covid-hera-hdx\tgo_subnational_covid19_hera.csv)")
head(dftgo)
# ID DATE ISO_3 PAYS ID_PAYS REGION ID_REGION CONTAMINES DECES GUERIS CONTAMINES_FEMME
# <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
# 1 1 06/0~ TGO Togo 3 Centre 22 0 0 0 0
# 2 2 06/0~ TGO Togo 3 Kara 21 0 0 0 0
# 3 3 06/0~ TGO Togo 3 Marit~ 24 1 0 0 1
#REGION column contains region names
unique(dftgo$REGION)
# "Centre" "Kara" "Maritime" "Plateaux" "Savanes" "Non spécifié"
sftgoadm1 <- afriadmin::afriadmin('tgo',level=1)
unique(sftgoadm1$shapeName)
#"Savanes Region" "Kara Region" "Centrale Region" "Plateaux Region" "Maritime Region"
#so to join will need to remove Region from geoboundaries & change some names eg centre v centrale
```
```{r, hera-bfa, eval=TRUE, warning=FALSE}
#dfbfaold <- readr::read_csv2(r"(C:\Dropbox\_afrimapr\data\covid-hera-hdx\bfa_subnational_covid19_hera.csv)")
dfbfa <- readr::read_csv2(r"(C:\Dropbox\_afrimapr\data\covid-hera-hdx\bfa_subnational_covid19_hera.csv)",
col_types='iccciciiiiiiicc')
#D for date didn't work on the way the date is formatted
library(lubridate)
dfbfa$date <- lubridate::dmy(dfbfa$DATE)
head(dfbfa)
# ID DATE ISO_3 PAYS ID_PAYS REGION ID_REGION CONTAMINES DECES GUERIS CONTAMINES_FEMME
# <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
# 1 1 09/0~ BFA Burk~ 16 Boucl~ 208 0 0 0 0
#not needed once column types specified in read_csv
#dfbfa$CONTAMINES <- as.numeric(dfbfa$CONTAMINES)
unique(dfbfa$REGION)
# [1] "Boucle du Mouhoun" "Cascades" "Centre" "Centre-Est" "Centre-Nord"
# [6] "Centre-Ouest" "Centre-Sud" "Est" "Haut-Bassins" "Nord"
# [11] "Plateau-Central" "Sahel" "Sud-Ouest" "Non spécifié"
sfbfaadm1 <- afriadmin::afriadmin('bfa',level=1)
unique(sfbfaadm1$shapeName)
# [1] "Centre" "Boucle du Mouhoun" "Cascades" "Centre-Est" "Centre-Nord"
# [6] "Centre-Ouest" "Centre-Sud" "Est" "Hauts-Bassins" "Nord"
# [11] "Plateau Central" "Sahel" "Sud-Ouest"
# cool these match better than togo, except for Hauts-Bassins != Haut-Bassins
# Plateau-Central != Plateau Central
# good case studies of things to match
# explore dfbfa
length(unique(dfbfa$DATE))
#216
#I wonder if fuzzyjoin works to join the data to the admin areas map ?
library(fuzzyjoin)
# cool the antijoin gets no results
# seems to suggest the inner join will catch all
fuzzyjoin::stringdist_anti_join(sfbfaadm1,dfbfa, by=c(shapeName = 'REGION'))
#Simple feature collection with 0 features and 5 fields
sf1 <- fuzzyjoin::stringdist_inner_join(sfbfaadm1,dfbfa, by=c(shapeName = 'REGION'))
# convert back to sf after fuzzyjoin
sf1 <- st_as_sf(sf1)
#plotting in tmap, maybe faceting by date, will need to be careful that in date order
# works but with 216 cells can't read date legend to see if in correct order
# tmap::tm_shape(sf1) +
# tm_polygons("CONTAMINES") +
# tm_facets(by = "date") #date is the formatted date column
# aggregate by month or week, using lubridate::floor_date
# first go just sum the results
# sfmonthly <- sf1 %>% group_by(month=floor_date(date, "month"),
# REGION=REGION) %>%
# summarize(CONTAMINES=sum(CONTAMINES))
#calculate weekly totals
sfweekly <- sf1 %>% group_by(week=floor_date(date, "week"),
REGION=REGION) %>%
summarize(CONTAMINES=sum(CONTAMINES,na.rm=TRUE))
#nice weekly facetted map
tmap::tm_shape(sfweekly) +
tm_polygons("CONTAMINES") +
tm_facets(by = "week")
#didn't change title
#tm_polygons("CONTAMINES", legend.title = "Burkina Faso, CONTAMINES") +
#this heatmap by region of cases, loosely based on Colin Angus work looks good
#TODO could add rolling 7 day averages to it
ggplot(sf1, aes(x=date, y=REGION, fill=CONTAMINES))+
geom_tile(colour="White")+
theme_classic()+
#scale_fill_distiller(palette="Spectral") +
#scale_fill_viridis_c()+
scale_fill_distiller(palette="YlGnBu", direction=1, na.value='white') +
theme(axis.line.y=element_blank())
#there is also some admin level2/city data
# lets see if I can download direct from the url got by right clicking on the link
url <- "https://data.humdata.org/dataset/4a09b23b-e941-43bf-bede-e35e9a149f3c/resource/14cbbd27-516f-4292-9033-fd9c5fd63f06/download/bfa_cityleveladm2_covid19_hera.csv"
# cool reading directly from the url does work
dfbfa_city <- readr::read_csv2(url)
dfbfa_city$date <- lubridate::dmy(dfbfa_city$DATE)
# [1] "ID" "DATE" "ISO_3" "PAYS" "ID_PAYS"
# [6] "REGION" "ID_REGION" "VILLES" "ID_VILLES" "COMMUNES_TYPE"
# [11] "Contaminés" "Décès" "Guéris" "Femme" "Homme"
# [16] "Genre_non spécifié" "Source"
ggplot(dfbfa_city, aes(x=date, y=VILLES, fill=Contaminés))+
geom_tile(colour="White")+
theme_classic()+
scale_fill_distiller(palette="Spectral") + #(palette="YlGnBu") +
theme(axis.line.y=element_blank())
```
```{r, hera-gha, eval=TRUE, warning=FALSE}
#Ghana
url <- "https://data.humdata.org/dataset/bc3589a6-04bc-4681-b531-7910ec800b4f/resource/1a9d2a9b-36a0-46bc-94df-b1a1d4f62bd5/download/gha_subnational_covid19_hera.csv"
# cool reading directly from the url does work
dfgha <- readr::read_csv2(url)
dfgha$date <- lubridate::dmy(dfgha$DATE)
month_breaks <- as.Date(lubridate::parse_date_time(c("2020-04","2020-05","2020-06","2020-07","2020-08","2020-09","2020-10","2020-11"), orders="ym"))
#this heatmap by region of cases, loosely based on Colin Angus work looks good
#https://github.com/VictimOfMaths/COVID-19/blob/master/Heatmaps/English%20LA%20Heatmaps.R
#TODO could add rolling 7 day averages to it
ggplot(dfgha, aes(x=date, y=REGION, fill=CONTAMINES))+
geom_tile(colour="White")+
theme_classic()+
#scale_fill_distiller(palette="Spectral") +
#scale_fill_viridis_c()+
scale_x_date(name="Date", expand=c(0,0), breaks=month_breaks, date_labels = "%B")+ #%B full month name
scale_fill_distiller(palette="YlGnBu", direction=1, na.value='white') +
labs(title="Timelines for COVID-19 cases in Ghana",
#subtitle=paste0(""),
caption="Data from @HeraAfrica via @humdata | Plot by @afrimapr")+
theme(axis.line.y=element_blank())
#try fuzzjoining to adm1 map
sfghaadm1 <- afriadmin::afriadmin('gha',level=1)
```
```{r, hera-nga, eval=TRUE, warning=FALSE}
#Nigeria
url <- "https://data.humdata.org/dataset/f5c35452-d766-468a-a272-4bd82d0a3be0/resource/e8777e62-870d-41a7-952f-97c6ff977706/download/nga_subnational_covid19_hera.csv"
# cool reading directly from the url does work
dfnga <- readr::read_csv2(url)
dfnga$date <- lubridate::dmy(dfnga$DATE)
month_breaks <- as.Date(lubridate::parse_date_time(c("2020-04","2020-05","2020-06","2020-07","2020-08","2020-09","2020-10","2020-11"), orders="ym"))
#this heatmap by region of cases, loosely based on Colin Angus work looks good
#TODO could add rolling 7 day averages to it
ggplot(dfnga, aes(x=date, y=REGION, fill=CONTAMINES))+
geom_tile(colour="White")+
theme_classic()+
#scale_fill_distiller(palette="Spectral") +
#scale_fill_viridis_c()+
scale_x_date(name="Date", expand=c(0,0), breaks=month_breaks, date_labels = "%B")+ #%B full month name
scale_fill_distiller(palette="YlGnBu", direction=1, na.value='white') +
labs(title="Timelines for COVID-19 cases in Nigeria",
#subtitle=paste0(""),
caption="Data from @HeraAfrica via @humdata | Plot by @afrimapr")+
theme(axis.line.y=element_blank())
#try fuzzjoining to adm1 map
sfngaadm1 <- afriadmin::afriadmin('nga',level=1, plot=FALSE)
fuzzyjoin::stringdist_anti_join(sfngaadm1,dfnga, by=c(shapeName = 'REGION'))
#Simple feature collection with 0 features and 5 fields
sf1 <- fuzzyjoin::stringdist_inner_join(sfngaadm1,dfnga, by=c(shapeName = 'REGION'))
# convert back to sf after fuzzyjoin
sf1 <- st_as_sf(sf1)
#plotting in tmap, faceting by date
# aggregate by month or week, using lubridate::floor_date
# first go just sum the results
#calculate weekly totals
sfweekly <- sf1 %>% group_by(week=floor_date(date, "week"),
REGION=REGION) %>%
summarize(CONTAMINES=sum(CONTAMINES,na.rm=TRUE))
#nice weekly facetted map
#TODO check whether colour breaks are same as for the heatmap
tmap::tm_shape(sfweekly) +
tm_polygons("CONTAMINES", palette = "YlGnBu") +
tm_facets(by = "week")
#try monthly
sfmonthly <- sf1 %>% group_by(month=floor_date(date, "month"),
REGION=REGION) %>%
summarize(CONTAMINES=sum(CONTAMINES,na.rm=TRUE))
#nice weekly facetted map
#TODO check whether colour breaks are same as for the heatmap
tmap::tm_shape(sfmonthly) +
tm_polygons("CONTAMINES", palette = "YlGnBu") +
tm_facets(by = "month")
```
## looking into getting hera data via rhdx
```{r, hera-rhdx, eval=TRUE, warning=FALSE}
library(rhdx)
set_rhdx_config(hdx_site = "prod")
get_rhdx_config()
#modified example from readme
library(tidyverse)
df2 <- search_datasets("hera", rows = 2) %>%
pluck(1) %>% ## select the first dataset
get_resource(2) %>% ## 2nd resource is csv
read_resource(delim=';')
#Ahmadou example from issue 8
pull_dataset("mauritania_covid19_subnational") %>%
get_resources(format = "csv") %>%
pluck(1) %>%
read_resource(delim = ";", locale = locale(decimal_mark = ","))
# this does return hera datasets
ds <- search_datasets("hera", rows=99)
#default returns 10
#this returns 21 subnational datasets
ds <- search_datasets("hera subnational", rows=99)
#trying to read them all into single dataframe
#11 Liberia has problems with DATE column, stops in May, doesn't have an ID column
#15 is called a web app and fails to load
#16 & later are cumulative for all Africa & different columns
to_exclude <- c(11,15,16,17,18,19,20,21)
ds <- ds[-to_exclude]
dfall <- NULL
for( i in 1:length(ds))
{
cat(i)
# df1 <- get_resource(ds[[i]], 1) %>% ## 1st resource is XLS (csv is ; delimited causing problems)
# read_resource() # read into R
#df1 <- get_resource(ds[[i]], 2) %>% ## 2nd resource is csv
df1 <- get_resources(ds[[i]], format = "csv") %>%
pluck(1) %>% ## select the first csv
read_resource(delim = ";", locale = locale(decimal_mark = ",")) # read into R
#should work too
#get_resources(format = "csv") %>%
#Liberia stops in May, doesn't have an ID column
#also gives this error, probably to do with dates
#Error in as.POSIXlt.character(x, tz, ...) :
#character string is not in a standard unambiguous format
if (names(df1)[1]=='DATE')
{
next #to miss out Liberia
#df1 <- NULL #set to NULL to miss Liberia out for now
# dfid <- tibble(ID=1:nrow(df1))
# df1 <- cbind(dfid,df1)
}
#some, but not all xls files, read rownames into first column
if (df1[[1,1]] == 'ID')
{
#set column names from 1st row
names(df1) <- as.character(df1[1,])
#remove first row
df1 <- df1[-1,]
}
#Benin stops in October and has different named columns, title case rather than upper case
#patch to fix it
if ("Femme" %in% names(df1))
names(df1) <- names(dfall)
#remove this column that occurs in just some ds e.g Gambia
if ('LIEN SOURCE' %in% names(df1))
df1 <- dplyr::select(df1, !`LIEN SOURCE`)
#bind these country rows onto all country rows
dfall <- rbind(dfall, df1)
}
#7 Gambia with csv
# Warning: 1365 parsing failures.
# row col expected actual file
# 1081 CONTAMINES a double null 'C:/Users/andy.south/AppData/Local/Cache/R/rhdx/gmb_subnational_covid19_hera.csv'
#saving the data object for all countries
dfhera <- dfall
# usethis::use_data(dfhera)
# Error: `use_data()` is designed to work with packages.
# Project 'afrimapr_dev' is not an R package.
save(dfhera, file=r"(data/dfhera.rda)")
```