covid-hera-hdx-explore.Rmd

---
title: "covid-hera-hdx-explore"
output: html_document
---
  
# code now moved to https://github.com/afrimapr/africovid  
  
# HERA on HDX have various subnational covid data for ~16 African countries
# starting to explore them  
# can potentially use as case study for data joining
  
https://data.humdata.org/organization/hera-humanitarian-emergency-response-africa?


searched for this on github to see if anyone has solved these issues already, but seemingly not
subnational_covid19_hera


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, message=FALSE, warning=FALSE)


library(dplyr)
library(ggplot2)
library(readr)
library(sf)
library(tmap)
library(lubridate)
library(afriadmin)

#using new path specification avoiding double slashes
folder <- r"(C:\Dropbox\_afrimapr\data\covid-hera-hdx\)"

```


```{r, hera-tgo, eval=TRUE, warning=FALSE}

dftgo <- readr::read_csv2(r"(C:\Dropbox\_afrimapr\data\covid-hera-hdx\tgo_subnational_covid19_hera.csv)")

head(dftgo)

#      ID DATE  ISO_3 PAYS  ID_PAYS REGION ID_REGION CONTAMINES DECES GUERIS CONTAMINES_FEMME
#   <dbl> <chr> <chr> <chr>   <dbl> <chr>      <dbl> <chr>      <chr> <chr>  <chr>           
# 1     1 06/0~ TGO   Togo        3 Centre        22 0          0     0      0               
# 2     2 06/0~ TGO   Togo        3 Kara          21 0          0     0      0               
# 3     3 06/0~ TGO   Togo        3 Marit~        24 1          0     0      1 

#REGION column contains region names

unique(dftgo$REGION)

# "Centre"       "Kara"         "Maritime"     "Plateaux"     "Savanes"      "Non spécifié"

sftgoadm1 <- afriadmin::afriadmin('tgo',level=1)

unique(sftgoadm1$shapeName)

#"Savanes Region"  "Kara Region"     "Centrale Region" "Plateaux Region" "Maritime Region"

#so to join will need to remove Region from geoboundaries & change some names eg centre v centrale

```


```{r, hera-bfa, eval=TRUE, warning=FALSE}

#dfbfaold <- readr::read_csv2(r"(C:\Dropbox\_afrimapr\data\covid-hera-hdx\bfa_subnational_covid19_hera.csv)")
dfbfa <- readr::read_csv2(r"(C:\Dropbox\_afrimapr\data\covid-hera-hdx\bfa_subnational_covid19_hera.csv)",
                          col_types='iccciciiiiiiicc')
#D for date didn't work on the way the date is formatted

library(lubridate)

dfbfa$date <- lubridate::dmy(dfbfa$DATE)

head(dfbfa)
#      ID DATE  ISO_3 PAYS  ID_PAYS REGION ID_REGION CONTAMINES DECES GUERIS CONTAMINES_FEMME
#   <dbl> <chr> <chr> <chr>   <dbl> <chr>      <dbl> <chr>      <chr> <chr>  <chr>           
# 1     1 09/0~ BFA   Burk~      16 Boucl~       208 0          0     0      0  

#not needed once column types specified in read_csv
#dfbfa$CONTAMINES <- as.numeric(dfbfa$CONTAMINES)


unique(dfbfa$REGION)

#  [1] "Boucle du Mouhoun" "Cascades"          "Centre"            "Centre-Est"        "Centre-Nord"    
#  [6] "Centre-Ouest"      "Centre-Sud"        "Est"               "Haut-Bassins"      "Nord"           
# [11] "Plateau-Central"   "Sahel"             "Sud-Ouest"         "Non spécifié"     

sfbfaadm1 <- afriadmin::afriadmin('bfa',level=1)

unique(sfbfaadm1$shapeName)

#  [1] "Centre"            "Boucle du Mouhoun" "Cascades"          "Centre-Est"        "Centre-Nord"    
#  [6] "Centre-Ouest"      "Centre-Sud"        "Est"               "Hauts-Bassins"     "Nord"           
# [11] "Plateau Central"   "Sahel"             "Sud-Ouest"  

# cool these match better than togo, except for Hauts-Bassins != Haut-Bassins
# Plateau-Central != Plateau Central
# good case studies of things to match

# explore dfbfa
length(unique(dfbfa$DATE))
#216

#I wonder if fuzzyjoin works to join the data to the admin areas map ?
library(fuzzyjoin)

# cool the antijoin gets no results
# seems to suggest the inner join will catch all

fuzzyjoin::stringdist_anti_join(sfbfaadm1,dfbfa, by=c(shapeName = 'REGION')) 
#Simple feature collection with 0 features and 5 fields

sf1 <- fuzzyjoin::stringdist_inner_join(sfbfaadm1,dfbfa, by=c(shapeName = 'REGION')) 
# convert back to sf after fuzzyjoin
sf1 <- st_as_sf(sf1)

#plotting in tmap, maybe faceting by date, will need to be careful that in date order

# works but with 216 cells can't read date legend to see if in correct order
# tmap::tm_shape(sf1) +
#     tm_polygons("CONTAMINES") +
#     tm_facets(by = "date") #date is the formatted date column

# aggregate by month or week, using lubridate::floor_date
# first go just sum the results
# sfmonthly <- sf1 %>% group_by(month=floor_date(date, "month"),
#                               REGION=REGION) %>%
#    summarize(CONTAMINES=sum(CONTAMINES))

#calculate weekly totals
sfweekly <- sf1 %>% group_by(week=floor_date(date, "week"),
                             REGION=REGION) %>%
   summarize(CONTAMINES=sum(CONTAMINES,na.rm=TRUE))

#nice weekly facetted map
tmap::tm_shape(sfweekly) +
    tm_polygons("CONTAMINES") +
    tm_facets(by = "week") 

#didn't change title
#tm_polygons("CONTAMINES", legend.title = "Burkina Faso, CONTAMINES") +

#this heatmap by region of cases, loosely based on Colin Angus work looks good
#TODO could add rolling 7 day averages to it
ggplot(sf1, aes(x=date, y=REGION, fill=CONTAMINES))+
  geom_tile(colour="White")+   
  theme_classic()+
  #scale_fill_distiller(palette="Spectral") +
  #scale_fill_viridis_c()+
  scale_fill_distiller(palette="YlGnBu", direction=1, na.value='white') +
  theme(axis.line.y=element_blank())


#there is also some admin level2/city data
# lets see if I can download direct from the url got by right clicking on the link
url <- "https://data.humdata.org/dataset/4a09b23b-e941-43bf-bede-e35e9a149f3c/resource/14cbbd27-516f-4292-9033-fd9c5fd63f06/download/bfa_cityleveladm2_covid19_hera.csv"

# cool reading directly from the url does work
dfbfa_city <- readr::read_csv2(url)
dfbfa_city$date <- lubridate::dmy(dfbfa_city$DATE)

#  [1] "ID"                 "DATE"               "ISO_3"              "PAYS"               "ID_PAYS"           
#  [6] "REGION"             "ID_REGION"          "VILLES"             "ID_VILLES"          "COMMUNES_TYPE"     
# [11] "Contaminés"         "Décès"              "Guéris"             "Femme"              "Homme"             
# [16] "Genre_non spécifié" "Source"   

ggplot(dfbfa_city, aes(x=date, y=VILLES, fill=Contaminés))+
  geom_tile(colour="White")+
  theme_classic()+
  scale_fill_distiller(palette="Spectral") + #(palette="YlGnBu") +
  theme(axis.line.y=element_blank())


```


```{r, hera-gha, eval=TRUE, warning=FALSE}

#Ghana

url <- "https://data.humdata.org/dataset/bc3589a6-04bc-4681-b531-7910ec800b4f/resource/1a9d2a9b-36a0-46bc-94df-b1a1d4f62bd5/download/gha_subnational_covid19_hera.csv"

# cool reading directly from the url does work
dfgha <- readr::read_csv2(url)
dfgha$date <- lubridate::dmy(dfgha$DATE)

month_breaks <- as.Date(lubridate::parse_date_time(c("2020-04","2020-05","2020-06","2020-07","2020-08","2020-09","2020-10","2020-11"), orders="ym")) 

#this heatmap by region of cases, loosely based on Colin Angus work looks good
#https://github.com/VictimOfMaths/COVID-19/blob/master/Heatmaps/English%20LA%20Heatmaps.R
#TODO could add rolling 7 day averages to it
ggplot(dfgha, aes(x=date, y=REGION, fill=CONTAMINES))+
  geom_tile(colour="White")+   
  theme_classic()+
  #scale_fill_distiller(palette="Spectral") +
  #scale_fill_viridis_c()+
  scale_x_date(name="Date", expand=c(0,0), breaks=month_breaks, date_labels = "%B")+ #%B full month name
  scale_fill_distiller(palette="YlGnBu", direction=1, na.value='white') +
    labs(title="Timelines for COVID-19 cases in Ghana",
       #subtitle=paste0(""),
       caption="Data from @HeraAfrica via @humdata | Plot by @afrimapr")+
  theme(axis.line.y=element_blank())

#try fuzzjoining to adm1 map
sfghaadm1 <- afriadmin::afriadmin('gha',level=1)


```


```{r, hera-nga, eval=TRUE, warning=FALSE}

#Nigeria
url <- "https://data.humdata.org/dataset/f5c35452-d766-468a-a272-4bd82d0a3be0/resource/e8777e62-870d-41a7-952f-97c6ff977706/download/nga_subnational_covid19_hera.csv"

# cool reading directly from the url does work
dfnga <- readr::read_csv2(url)
dfnga$date <- lubridate::dmy(dfnga$DATE)

month_breaks <- as.Date(lubridate::parse_date_time(c("2020-04","2020-05","2020-06","2020-07","2020-08","2020-09","2020-10","2020-11"), orders="ym")) 

#this heatmap by region of cases, loosely based on Colin Angus work looks good
#TODO could add rolling 7 day averages to it
ggplot(dfnga, aes(x=date, y=REGION, fill=CONTAMINES))+
  geom_tile(colour="White")+   
  theme_classic()+
  #scale_fill_distiller(palette="Spectral") +
  #scale_fill_viridis_c()+
  scale_x_date(name="Date", expand=c(0,0), breaks=month_breaks, date_labels = "%B")+ #%B full month name
  scale_fill_distiller(palette="YlGnBu", direction=1, na.value='white') +
    labs(title="Timelines for COVID-19 cases in Nigeria",
       #subtitle=paste0(""),
       caption="Data from @HeraAfrica via @humdata | Plot by @afrimapr")+
  theme(axis.line.y=element_blank())

#try fuzzjoining to adm1 map
sfngaadm1 <- afriadmin::afriadmin('nga',level=1, plot=FALSE)

fuzzyjoin::stringdist_anti_join(sfngaadm1,dfnga, by=c(shapeName = 'REGION')) 
#Simple feature collection with 0 features and 5 fields

sf1 <- fuzzyjoin::stringdist_inner_join(sfngaadm1,dfnga, by=c(shapeName = 'REGION')) 
# convert back to sf after fuzzyjoin
sf1 <- st_as_sf(sf1)

#plotting in tmap, faceting by date

# aggregate by month or week, using lubridate::floor_date
# first go just sum the results
#calculate weekly totals
sfweekly <- sf1 %>% group_by(week=floor_date(date, "week"),
                             REGION=REGION) %>%
   summarize(CONTAMINES=sum(CONTAMINES,na.rm=TRUE))

#nice weekly facetted map
#TODO check whether colour breaks are same as for the heatmap
tmap::tm_shape(sfweekly) +
    tm_polygons("CONTAMINES", palette = "YlGnBu") +
    tm_facets(by = "week") 

#try monthly
sfmonthly <- sf1 %>% group_by(month=floor_date(date, "month"),
                             REGION=REGION) %>%
   summarize(CONTAMINES=sum(CONTAMINES,na.rm=TRUE))

#nice weekly facetted map
#TODO check whether colour breaks are same as for the heatmap
tmap::tm_shape(sfmonthly) +
    tm_polygons("CONTAMINES", palette = "YlGnBu") +
    tm_facets(by = "month") 

```

## looking into getting hera data via rhdx

```{r, hera-rhdx, eval=TRUE, warning=FALSE}

library(rhdx)

set_rhdx_config(hdx_site = "prod")
get_rhdx_config()

#modified example from readme
library(tidyverse)
df2 <- search_datasets("hera", rows = 2) %>% 
  pluck(1) %>% ## select the first dataset
  get_resource(2) %>% ## 2nd resource is csv
  read_resource(delim=';') 

#Ahmadou example from issue 8
pull_dataset("mauritania_covid19_subnational") %>%
  get_resources(format = "csv") %>%
  pluck(1) %>%
  read_resource(delim = ";", locale = locale(decimal_mark = ","))


# this does return hera datasets
ds <- search_datasets("hera", rows=99)

#default returns 10

#this returns 21 subnational datasets
ds <- search_datasets("hera subnational", rows=99)

#trying to read them all into single dataframe

#11 Liberia has problems with DATE column, stops in May, doesn't have an ID column
#15 is called a web app and fails to load
#16 & later are cumulative for all Africa & different columns
to_exclude <- c(11,15,16,17,18,19,20,21)
ds <- ds[-to_exclude]

dfall <- NULL

for( i in 1:length(ds))
{
  cat(i)
  
  # df1 <- get_resource(ds[[i]], 1) %>% ## 1st resource is XLS (csv is ; delimited causing problems)
  #        read_resource() # read into R
  
  #df1 <- get_resource(ds[[i]], 2) %>% ## 2nd resource is csv 
  df1 <- get_resources(ds[[i]], format = "csv") %>%
         pluck(1) %>% ## select the first csv
         read_resource(delim = ";", locale = locale(decimal_mark = ",")) # read into R
  
  #should work too
  #get_resources(format = "csv") %>%
  
  #Liberia stops in May, doesn't have an ID column
  #also gives this error, probably to do with dates
  #Error in as.POSIXlt.character(x, tz, ...) : 
  #character string is not in a standard unambiguous format
  if (names(df1)[1]=='DATE')
  {
    next #to miss out Liberia
    #df1 <- NULL #set to NULL to miss Liberia out for now
    # dfid <- tibble(ID=1:nrow(df1))
    # df1 <- cbind(dfid,df1)
  }  
  
  #some, but not all xls files, read rownames into first column
  if (df1[[1,1]] == 'ID')
  {
    #set column names from 1st row
    names(df1) <- as.character(df1[1,])
    #remove first row
    df1 <- df1[-1,]    
  }
  
  #Benin stops in October and has different named columns, title case rather than upper case
  #patch to fix it
  if ("Femme" %in% names(df1))
    names(df1) <- names(dfall)

  #remove this column that occurs in just some ds e.g Gambia
  if ('LIEN SOURCE' %in% names(df1))
    df1 <- dplyr::select(df1, !`LIEN SOURCE`)
  
  #bind these country rows onto all country rows  
  dfall <- rbind(dfall, df1)
}

#7 Gambia with csv
# Warning: 1365 parsing failures.
#  row                           col expected actual                                                                              file
# 1081 CONTAMINES                    a double   null 'C:/Users/andy.south/AppData/Local/Cache/R/rhdx/gmb_subnational_covid19_hera.csv'

#saving the data object for all countries
dfhera <- dfall

# usethis::use_data(dfhera)
# Error: `use_data()` is designed to work with packages.
# Project 'afrimapr_dev' is not an R package.

save(dfhera, file=r"(data/dfhera.rda)")


```