Commit 01f4886: Initial commit

ElliottMess committed Jan 12, 2021 (0 parents)

Showing 11 changed files with 106,612 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
19 changes: 19 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,19 @@
Package: pcodesOCHA
Type: Package
Title: Scrape P-codes from UNOCHA COD
Version: 0.1.0
Author: Elliott Messeiller <elliott.messeiller@impact-initiatives.org>
Maintainer: Elliott Messeiller <elliott.messeiller@impact-initiatives.org>
Description: Scrape P-codes from the UNOCHA Common Operational Datasets web server.
Depends:
R (>= 3.6),
dplyr,
tidyr,
stringr,
httr,
jsonlite,
rvest
License: AGPL (>= 3)
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -0,0 +1,3 @@
# Generated by roxygen2: do not edit by hand

export(all_pcodes_feature_servers)
181 changes: 181 additions & 0 deletions R/pcodes_scrapping.R
@@ -0,0 +1,181 @@

#' Scrape P-code datasets from the UNOCHA REST API
#'
#' @return a data frame containing all P-codes at the lowest administrative level available for each country
#' @export
#'
#' @examples
#' \dontrun{
#' all_pcodes_feature_servers()
#' }
all_pcodes_feature_servers <- function(){
  base_URL <- "https://gistmaps.itos.uga.edu/"

  COD_URL <- paste0(base_URL, "arcgis/rest/services/COD_External")

  # List the services in the COD_External directory and keep the FeatureServers
  COD_list <- read_html(COD_URL) %>%
    html_nodes("a") %>%
    html_attr("href")

  COD_list <- COD_list[grepl("FeatureServer$", COD_list)]

  # Scrape each country's FeatureServer and stack the results
  all_dfs <- lapply(COD_list, scrap_country) %>%
    bind_rows()

  # Prefer the English admin names; fall back to French, Spanish, then Portuguese
  all_dfs_rearranged <- all_dfs %>%
    select(-OBJECTID) %>%
    mutate(
      admin0Name = coalesce(admin0Name_en, admin0Name_fr, admin0Name_es, admin0Name_pt),
      admin1Name = coalesce(admin1Name_en, admin1Name_fr, admin1Name_es, admin1Name_pt),
      admin2Name = coalesce(admin2Name_en, admin2Name_fr, admin2Name_es, admin2Name_pt)
    ) %>%
    relocate(admin0Name, admin0Pcode, admin1Name, admin1Pcode, admin2Name, admin2Pcode)

  return(all_dfs_rearranged)
}

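#' Scrape the attribute table of a single FeatureServer layer
#'
#' Internal helper: reads the layer page title for the layer name and
#' queries the layer's REST endpoint for all attributes as JSON.
#'
#' @param URL relative URL of a FeatureServer layer
#' @return a list with the layer name and a data frame of attributes
#' @noRd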
scrap_one_URL_properties <- function(URL){
  COD_URL <- "https://gistmaps.itos.uga.edu"

  # Layer name: the text between ": " and " (" in the page <h2>
  url_name <- tryCatch({
    paste0(COD_URL, URL) %>%
      read_html() %>%
      html_nodes("h2") %>%
      as.character() %>%
      stringr::str_extract("(?<=: ).*?(?= \\()")
  },
  error = function(e){
    cat("URL not available:", conditionMessage(e), "\n")
  })

  # Query all attribute fields, without geometry, as JSON
  query_URL <- paste0(COD_URL, URL, "/query?where=OBJECTID%20%3E%200&outFields=%2A&returnGeometry=false&f=json")

  json <- tryCatch({fromJSON(query_URL)},
                   error = function(e){cat("URL not available:", conditionMessage(e), "\n")})

  properties <- json$features$attributes
  list_properties <- list(name = url_name, data = properties)
  return(list_properties)
}


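#' Scrape the P-codes of one country's FeatureServer
#'
#' Lists the layers of a country's FeatureServer, picks the lowest
#' administrative level available, and returns its attribute table.
#'
#' @param URL relative URL of a country's FeatureServer
#' @return a data frame of attributes, or NULL if the URL is unavailable
#' @noRd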
scrap_country <- function(URL){
  COD_URL <- "https://gistmaps.itos.uga.edu"

  # Layer links on the /layers page end with a numeric layer id
  all_layers <- tryCatch({
    paste0(COD_URL, URL, "/layers") %>%
      read_html() %>%
      html_nodes("a") %>%
      html_attr("href")
  },
  error = function(e){cat("URL not available:", conditionMessage(e), "\n")}
  )

  all_layers <- all_layers[grepl("[0-9]$", all_layers)]

  # The <h3> headers hold the layer names and ids
  all_layers_available <- tryCatch({
    paste0(COD_URL, URL, "/layers") %>%
      read_html() %>%
      html_nodes("h3") %>%
      as.character()
  },
  error = function(e){cat("URL not available:", conditionMessage(e), "\n")}
  )

  if(is.null(all_layers) || is.null(all_layers_available)){
    message("URL not available")
    return(NULL)
  } else {
    all_layers_names <- stringr::str_extract(all_layers_available, '(?<=">).*?(?=<)')
    all_layers_id <- stringr::str_extract(all_layers_available, "(?<=<\\/a> \\().*?(?=\\)<\\/h3>)")

    # Keep only the deepest administrative level available (e.g. Admin2)
    admin_levels_nb <- as.numeric(unlist(stringr::str_extract_all(all_layers_names[grepl("^Admin[0-9]", all_layers_names)], "[0-9]")))
    lowest_admin <- all_layers_names[grepl(paste0("Admin", max(admin_levels_nb)), all_layers_names)]
    lowest_admin_id <- all_layers_id[grepl(lowest_admin, all_layers_names, fixed = TRUE)]

    lowest_admin_layer <- all_layers[grepl(paste0(lowest_admin_id, "$"), all_layers)]

    all_properties <- scrap_one_URL_properties(lowest_admin_layer)

    return(all_properties$data)
  }
}

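#' Scrape one FeatureServer layer as GeoJSON
#'
#' Queries the layer's REST endpoint with geometry included and returns the
#' raw GeoJSON response as lines of text.
#'
#' @param URL relative URL of a FeatureServer layer
#' @return a character vector of GeoJSON lines
#' @noRd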
scrap_one_URL_geoJSON <- function(URL){
  COD_URL <- "https://gistmaps.itos.uga.edu"

  # Layer name from the page <h2>; currently unused (GeoJSON path is partial)
  url_name <- tryCatch({
    paste0(COD_URL, URL) %>%
      read_html() %>%
      html_nodes("h2") %>%
      as.character() %>%
      stringr::str_extract("(?<=: ).*?(?= \\()")
  },
  error = function(e){
    cat("URL not available:", conditionMessage(e), "\n")
  })

  # Query the full layer, geometry included, as GeoJSON
  query_URL <- paste0(COD_URL, URL, "/query?where=OBJECTID+>%3D+0&objectIds=&time=&geometry=&geometryType=esriGeometryPolygon&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson")

  readlines_geojson <- readLines(query_URL, warn = FALSE)

  return(readlines_geojson)
}


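#' Scrape one country's lowest administrative level as GeoJSON
#'
#' Same layer discovery as scrap_country(), but returns the raw GeoJSON of
#' the lowest administrative level instead of the attribute table.
#'
#' @param URL relative URL of a country's FeatureServer
#' @return a character vector of GeoJSON lines, or NULL if unavailable
#' @noRd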
scrap_country_geojson <- function(URL){
  COD_URL <- "https://gistmaps.itos.uga.edu"

  # Layer links on the /layers page end with a numeric layer id
  all_layers <- tryCatch({
    paste0(COD_URL, URL, "/layers") %>%
      read_html() %>%
      html_nodes("a") %>%
      html_attr("href")
  },
  error = function(e){cat("URL not available:", conditionMessage(e), "\n")}
  )

  all_layers <- all_layers[grepl("[0-9]$", all_layers)]

  # The <h3> headers hold the layer names and ids
  all_layers_available <- tryCatch({
    paste0(COD_URL, URL, "/layers") %>%
      read_html() %>%
      html_nodes("h3") %>%
      as.character()
  },
  error = function(e){cat("URL not available:", conditionMessage(e), "\n")}
  )

  if(is.null(all_layers) || is.null(all_layers_available)){
    message("URL not available")
    return(NULL)
  } else {
    all_layers_names <- stringr::str_extract(all_layers_available, '(?<=">).*?(?=<)')
    all_layers_id <- stringr::str_extract(all_layers_available, "(?<=<\\/a> \\().*?(?=\\)<\\/h3>)")

    # Keep only the deepest administrative level available (e.g. Admin2)
    admin_levels_nb <- as.numeric(unlist(stringr::str_extract_all(all_layers_names[grepl("^Admin[0-9]", all_layers_names)], "[0-9]")))
    lowest_admin <- all_layers_names[grepl(paste0("Admin", max(admin_levels_nb)), all_layers_names)]
    lowest_admin_id <- all_layers_id[grepl(lowest_admin, all_layers_names, fixed = TRUE)]

    lowest_admin_layer <- all_layers[grepl(paste0(lowest_admin_id, "$"), all_layers)]

    geo_json <- scrap_one_URL_geoJSON(lowest_admin_layer)

    return(geo_json)
  }
}
26 changes: 26 additions & 0 deletions README.Rmd
@@ -0,0 +1,26 @@
---
output: github_document
---

<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "man/figures/README-",
out.width = "100%"
)
```

# pcodesOCHA

<!-- badges: start -->
<!-- badges: end -->

The goal of pcodesOCHA is to scrape P-codes from the [UNOCHA Common Operational Datasets web server](https://gistmaps.itos.uga.edu/arcgis/rest/services).

Only one function is exported: `all_pcodes_feature_servers()`. Other helper functions are available internally.

One function, `scrap_country_geojson()`, is partially implemented to scrape GeoJSON files.
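
A minimal usage sketch (the GitHub repository path below is an assumption):

```{r, eval = FALSE}
# remotes::install_github("ElliottMess/pcodesOCHA") # assumed repository path
library(pcodesOCHA)

# Scrape every COD FeatureServer; one request per country, so this can take a while
all_pcodes <- all_pcodes_feature_servers()
head(all_pcodes)
```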
18 changes: 18 additions & 0 deletions README.md
@@ -0,0 +1,18 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->

# pcodesOCHA

<!-- badges: start -->

<!-- badges: end -->

The goal of pcodesOCHA is to scrape P-codes from the [UNOCHA Common
Operational Datasets web
server](https://gistmaps.itos.uga.edu/arcgis/rest/services).

Only one function is exported: `all_pcodes_feature_servers()`. Other
helper functions are available internally.

One function, `scrap_country_geojson()`, is partially implemented to
scrape GeoJSON files.
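
Since `scrap_country_geojson()` is not exported, it has to be reached
with `:::`. A sketch of the partial GeoJSON path, reusing the same
service discovery that `all_pcodes_feature_servers()` performs (assumes
the COD server is reachable):

``` r
library(rvest)

# Discover the COD_External FeatureServer paths
services <- read_html("https://gistmaps.itos.uga.edu/arcgis/rest/services/COD_External") %>%
  html_nodes("a") %>%
  html_attr("href")
services <- services[grepl("FeatureServer$", services)]

# Raw GeoJSON (lines of text) for one country's lowest admin level
geojson_lines <- pcodesOCHA:::scrap_country_geojson(services[1])
```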
19 changes: 19 additions & 0 deletions man/all_pcodes_feature_servers.Rd

(Generated file; diff not rendered.)
