Commit 01f4886: Initial commit

ElliottMess committed Jan 12, 2021 (0 parents)

Showing 11 changed files with 106,612 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
19 changes: 19 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,19 @@
Package: pcodesOCHA
Type: Package
Title: Scrape P-codes from UNOCHA COD
Version: 0.1.0
Author: Elliott Messeiller <elliott.messeiller@impact-initiatives.org>
Maintainer: Elliott Messeiller <elliott.messeiller@impact-initiatives.org>
Description: Scrape P-codes from the UNOCHA Common Operational Datasets web server.
Depends:
R (>= 3.6),
dplyr,
tidyr,
stringr,
httr,
jsonlite,
rvest
License: AGPL (>= 3)
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -0,0 +1,3 @@
# Generated by roxygen2: do not edit by hand

export(all_pcodes_feature_servers)
181 changes: 181 additions & 0 deletions R/pcodes_scrapping.R
@@ -0,0 +1,181 @@

#' Scrape P-code datasets from the UNOCHA REST API
#'
#' @return a data frame containing all P-codes at the lowest administrative level available for each country
#' @export
#'
#' @examples
#' \dontrun{
#' all_pcodes_feature_servers()
#' }
all_pcodes_feature_servers <- function(){
  base_URL <- "https://gistmaps.itos.uga.edu/"

  COD_URL <- paste0(base_URL, "arcgis/rest/services/COD_External")

  # List the services in the COD_External directory and keep the FeatureServers
  COD_list <- read_html(COD_URL) %>%
    html_nodes("a") %>%
    html_attr("href")

  COD_list <- COD_list[grepl("FeatureServer$", COD_list)]

  # Scrape each country's FeatureServer and stack the results
  all_dfs <- lapply(COD_list, scrap_country) %>%
    bind_rows()

  # Prefer the English admin names; fall back to French, Spanish, then Portuguese
  all_dfs_rearranged <- all_dfs %>%
    select(-OBJECTID) %>%
    mutate(
      admin0Name = coalesce(admin0Name_en, admin0Name_fr, admin0Name_es, admin0Name_pt),
      admin1Name = coalesce(admin1Name_en, admin1Name_fr, admin1Name_es, admin1Name_pt),
      admin2Name = coalesce(admin2Name_en, admin2Name_fr, admin2Name_es, admin2Name_pt)
    ) %>%
    relocate(admin0Name, admin0Pcode, admin1Name, admin1Pcode, admin2Name, admin2Pcode)

  return(all_dfs_rearranged)
}

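#' Scrape the attribute table of a single FeatureServer layer
#'
#' Internal helper: reads the layer page title for the layer name and
#' queries the layer's REST endpoint for all attributes as JSON.
#'
#' @param URL relative URL of a FeatureServer layer
#' @return a list with the layer name and a data frame of attributes
#' @noRd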
scrap_one_URL_properties <- function(URL){
  COD_URL <- "https://gistmaps.itos.uga.edu"

  # Layer name: the text between ": " and " (" in the page <h2>
  url_name <- tryCatch({
    paste0(COD_URL, URL) %>%
      read_html() %>%
      html_nodes("h2") %>%
      as.character() %>%
      stringr::str_extract("(?<=: ).*?(?= \\()")
  },
  error = function(e){
    cat("URL not available:", conditionMessage(e), "\n")
  })

  # Query all attribute fields, without geometry, as JSON
  query_URL <- paste0(COD_URL, URL, "/query?where=OBJECTID%20%3E%200&outFields=%2A&returnGeometry=false&f=json")

  json <- tryCatch({fromJSON(query_URL)},
                   error = function(e){cat("URL not available:", conditionMessage(e), "\n")})

  properties <- json$features$attributes
  list_properties <- list(name = url_name, data = properties)
  return(list_properties)
}


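#' Scrape the P-codes of one country's FeatureServer
#'
#' Lists the layers of a country's FeatureServer, picks the lowest
#' administrative level available, and returns its attribute table.
#'
#' @param URL relative URL of a country's FeatureServer
#' @return a data frame of attributes, or NULL if the URL is unavailable
#' @noRd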
scrap_country <- function(URL){
  COD_URL <- "https://gistmaps.itos.uga.edu"

  # Layer links on the /layers page end with a numeric layer id
  all_layers <- tryCatch({
    paste0(COD_URL, URL, "/layers") %>%
      read_html() %>%
      html_nodes("a") %>%
      html_attr("href")
  },
  error = function(e){cat("URL not available:", conditionMessage(e), "\n")}
  )

  all_layers <- all_layers[grepl("[0-9]$", all_layers)]

  # The <h3> headers hold the layer names and ids
  all_layers_available <- tryCatch({
    paste0(COD_URL, URL, "/layers") %>%
      read_html() %>%
      html_nodes("h3") %>%
      as.character()
  },
  error = function(e){cat("URL not available:", conditionMessage(e), "\n")}
  )

  if(is.null(all_layers) || is.null(all_layers_available)){
    message("URL not available")
    return(NULL)
  } else {
    all_layers_names <- stringr::str_extract(all_layers_available, '(?<=">).*?(?=<)')
    all_layers_id <- stringr::str_extract(all_layers_available, "(?<=<\\/a> \\().*?(?=\\)<\\/h3>)")

    # Keep only the deepest administrative level available (e.g. Admin2)
    admin_levels_nb <- as.numeric(unlist(stringr::str_extract_all(all_layers_names[grepl("^Admin[0-9]", all_layers_names)], "[0-9]")))
    lowest_admin <- all_layers_names[grepl(paste0("Admin", max(admin_levels_nb)), all_layers_names)]
    lowest_admin_id <- all_layers_id[grepl(lowest_admin, all_layers_names, fixed = TRUE)]

    lowest_admin_layer <- all_layers[grepl(paste0(lowest_admin_id, "$"), all_layers)]

    all_properties <- scrap_one_URL_properties(lowest_admin_layer)

    return(all_properties$data)
  }
}

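#' Scrape one FeatureServer layer as GeoJSON
#'
#' Queries the layer's REST endpoint with geometry included and returns the
#' raw GeoJSON response as lines of text.
#'
#' @param URL relative URL of a FeatureServer layer
#' @return a character vector of GeoJSON lines
#' @noRd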
scrap_one_URL_geoJSON <- function(URL){
  COD_URL <- "https://gistmaps.itos.uga.edu"

  # Layer name from the page <h2>; currently unused (GeoJSON path is partial)
  url_name <- tryCatch({
    paste0(COD_URL, URL) %>%
      read_html() %>%
      html_nodes("h2") %>%
      as.character() %>%
      stringr::str_extract("(?<=: ).*?(?= \\()")
  },
  error = function(e){
    cat("URL not available:", conditionMessage(e), "\n")
  })

  # Query the full layer, geometry included, as GeoJSON
  query_URL <- paste0(COD_URL, URL, "/query?where=OBJECTID+>%3D+0&objectIds=&time=&geometry=&geometryType=esriGeometryPolygon&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson")

  readlines_geojson <- readLines(query_URL, warn = FALSE)

  return(readlines_geojson)
}


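#' Scrape one country's lowest administrative level as GeoJSON
#'
#' Same layer discovery as scrap_country(), but returns the raw GeoJSON of
#' the lowest administrative level instead of the attribute table.
#'
#' @param URL relative URL of a country's FeatureServer
#' @return a character vector of GeoJSON lines, or NULL if unavailable
#' @noRd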
scrap_country_geojson <- function(URL){
  COD_URL <- "https://gistmaps.itos.uga.edu"

  # Layer links on the /layers page end with a numeric layer id
  all_layers <- tryCatch({
    paste0(COD_URL, URL, "/layers") %>%
      read_html() %>%
      html_nodes("a") %>%
      html_attr("href")
  },
  error = function(e){cat("URL not available:", conditionMessage(e), "\n")}
  )

  all_layers <- all_layers[grepl("[0-9]$", all_layers)]

  # The <h3> headers hold the layer names and ids
  all_layers_available <- tryCatch({
    paste0(COD_URL, URL, "/layers") %>%
      read_html() %>%
      html_nodes("h3") %>%
      as.character()
  },
  error = function(e){cat("URL not available:", conditionMessage(e), "\n")}
  )

  if(is.null(all_layers) || is.null(all_layers_available)){
    message("URL not available")
    return(NULL)
  } else {
    all_layers_names <- stringr::str_extract(all_layers_available, '(?<=">).*?(?=<)')
    all_layers_id <- stringr::str_extract(all_layers_available, "(?<=<\\/a> \\().*?(?=\\)<\\/h3>)")

    # Keep only the deepest administrative level available (e.g. Admin2)
    admin_levels_nb <- as.numeric(unlist(stringr::str_extract_all(all_layers_names[grepl("^Admin[0-9]", all_layers_names)], "[0-9]")))
    lowest_admin <- all_layers_names[grepl(paste0("Admin", max(admin_levels_nb)), all_layers_names)]
    lowest_admin_id <- all_layers_id[grepl(lowest_admin, all_layers_names, fixed = TRUE)]

    lowest_admin_layer <- all_layers[grepl(paste0(lowest_admin_id, "$"), all_layers)]

    geo_json <- scrap_one_URL_geoJSON(lowest_admin_layer)

    return(geo_json)
  }
}
26 changes: 26 additions & 0 deletions README.Rmd
@@ -0,0 +1,26 @@
---
output: github_document
---

<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "man/figures/README-",
out.width = "100%"
)
```

# pcodesOCHA

<!-- badges: start -->
<!-- badges: end -->

The goal of pcodesOCHA is to scrape P-codes from the [UNOCHA Common Operational Datasets web server](https://gistmaps.itos.uga.edu/arcgis/rest/services).

Only one function is exported: `all_pcodes_feature_servers()`. Other helper functions are available internally.

One function, `scrap_country_geojson()`, is partially implemented to scrape GeoJSON files.
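
A minimal usage sketch (the GitHub repository path below is an assumption):

```{r, eval = FALSE}
# remotes::install_github("ElliottMess/pcodesOCHA") # assumed repository path
library(pcodesOCHA)

# Scrape every COD FeatureServer; one request per country, so this can take a while
all_pcodes <- all_pcodes_feature_servers()
head(all_pcodes)
```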
18 changes: 18 additions & 0 deletions README.md
@@ -0,0 +1,18 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->

# pcodesOCHA

<!-- badges: start -->

<!-- badges: end -->

The goal of pcodesOCHA is to scrape P-codes from the [UNOCHA Common
Operational Datasets web
server](https://gistmaps.itos.uga.edu/arcgis/rest/services).

Only one function is exported: `all_pcodes_feature_servers()`. Other
helper functions are available internally.

One function, `scrap_country_geojson()`, is partially implemented to
scrape GeoJSON files.
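
Since `scrap_country_geojson()` is not exported, it has to be reached
with `:::`. A sketch of the partial GeoJSON path, reusing the same
service discovery that `all_pcodes_feature_servers()` performs (assumes
the COD server is reachable):

``` r
library(rvest)

# Discover the COD_External FeatureServer paths
services <- read_html("https://gistmaps.itos.uga.edu/arcgis/rest/services/COD_External") %>%
  html_nodes("a") %>%
  html_attr("href")
services <- services[grepl("FeatureServer$", services)]

# Raw GeoJSON (lines of text) for one country's lowest admin level
geojson_lines <- pcodesOCHA:::scrap_country_geojson(services[1])
```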
19 changes: 19 additions & 0 deletions man/all_pcodes_feature_servers.Rd

(Generated file; diff not rendered.)
