add scripts to EJP repository
colla007 committed Jan 31, 2025
1 parent c9dbb34 commit 35b6d22
Showing 9 changed files with 996 additions and 1 deletion.
379 changes: 379 additions & 0 deletions scripts/validation/EJP-catalogue-cleaning.R
@@ -0,0 +1,379 @@
# task: assess the content of the EJP SOIL catalogue at https://catalogue.ejpsoil.eu

# load libraries ----
library(jsonlite)
library(xlsx)
library(stringr)
library(dplyr)

# create an empty data frame to store the assessment ----
# character vector with the column names
columns <- c("check_id", "id",
"title", "title_check", #"title_notes",
"keywords", "scope", "country", "concept", "folder",
"description", "license",
"time", "created", "updated",
"providers", "publisher", "prov_url", "prov_url_works", "email",
"ext_id", "ext_id_works", "ext_id_notes",
"links", "link_works", "links_notes", "url_fix",
"catalogue_url", "canonical")
# pass this vector's length to ncol and 0 to nrow
checks <- data.frame(matrix(nrow = 0, ncol = length(columns)))
# assign column names
colnames(checks) <- columns
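# checks is now a zero-row data frame with these 28 named columns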

# valid url function ----
# returns TRUE if url_in can be opened within t seconds, FALSE otherwise
valid_url <- function(url_in, t = 2){
con <- url(url_in)
check <- suppressWarnings(try(open.connection(con, open = "rt", timeout = t), silent = TRUE)[1])
suppressWarnings(try(close.connection(con), silent = TRUE))
is.null(check)
}

# urls <- c("https://www.dov.vlaanderen.be/bodemverkenner", ... )
# sapply(urls,valid_url)
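# a quick illustrative check (hypothetical URLs, requires network access):
# sapply(c("https://example.org", "http://no.such.host.invalid"), valid_url)
# # expected: TRUE for the reachable url, FALSE for the unreachable one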

# loop catalog ----

# the catalogue paginates in steps of 50 items and holds 577 in total ----
nmax <- 577
offset_i <- seq(from = 0, to = nmax, by = 50)
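# seq(0, 577, 50) gives 0, 50, 100, ..., 550: twelve requests of up to 50 records each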

for (i in offset_i){

# load JSON catalog from url
EJP_catalog <- fromJSON(paste0("https://catalogue.ejpsoil.eu/collections/metadata:main/items?offset=", i))
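# the response is GeoJSON-like; the fields read below live under features$id,
# features$time, features$links and features$properties (title, keywords, themes,
# description, created, updated, providers, externalIds, license)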
# get id
id <- EJP_catalog$features$id
# get title
title <- EJP_catalog$features$properties$title # vector
# get keywords
keywords <- EJP_catalog$features$properties$keywords # list
# get themes
themes <- EJP_catalog$features$properties$themes
# get description
description <- EJP_catalog$features$properties$description # vector # does it have a description? (no = 0, yes = 1) does the description make sense? (no = 2)
# get time
time <- EJP_catalog$features$time
# get created
created <- EJP_catalog$features$properties$created
# get updated
updated <- EJP_catalog$features$properties$updated
# get providers
providers <- EJP_catalog$features$properties$providers # df list # does it have providers? (no = 0, yes = 1) what info about providers is significant? email? url?
# get external id's
ext_id <- EJP_catalog$features$properties$externalIds # df list, "value" # does it have ext_id? does it work?
# get links
links <- EJP_catalog$features$links # df list, "href", # does it have href link? (no = 0, yes = 1) does it work?
# get license
license <- EJP_catalog$features$properties$license # vector

# fill checks table

# first row index for this page's records
j <- i + 1

# fill "check_id"
print(paste0("fill id: ", i))
checks[j:(i+length(id)),"check_id"] <- j:(i+length(id))

# fill id ----
checks[j:(i+length(id)),"id"] <- id

# fill "title" ----
print(paste0("fill title: ", i))
checks[j:(i+length(title)), "title"] <- title

# fill "title_check"
checks[j:(i+length(title)), "title_check"] <- ifelse(is.na(checks$title[j:(i+length(title))]), 0, 1) # does it have a title? (no = 0, yes = 1)

# fill "license" ----
if (is.null(license)){
checks[j:(i+length(license)), "license"] <- NA
} else {
checks[j:(i+length(license)), "license"] <- license
}

# fill "keywords" ----
print(paste0("fill keywords: ", i))
# does it have keywords? which?
l <- j
for (k in seq_along(keywords)){
checks[l, "keywords"] <- paste(keywords[[k]], collapse = ",")
l <- l + 1
}
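# e.g. a record whose keywords list is c("soil", "carbon") yields the cell "soil,carbon" (illustrative values)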

# fill canonical ----
# extract github link
l <- j
for (k in seq_along(links)){
canonical_k <- subset(links[[k]], type == "canonical")
checks[l, "canonical"] <- canonical_k$href
l <- l + 1
}

# fill folder ----
l <- j
for (k in seq_along(links)){
canonical_k <- subset(links[[k]], type == "canonical")$href
checks[l, "folder"] <- substr(canonical_k, 62, max(unlist(gregexpr('/', canonical_k)))-1)
l <- l + 1
}
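# note: the hard-coded 62 is assumed to skip a fixed prefix of the canonical GitHub url,
# so "folder" keeps the path segments between that prefix and the last "/" (illustrative)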

# fill scope ----
print(paste0("fill scope: ", i))
l <- j
for (k in seq_along(themes)){
scope_k <- themes[[k]]$concepts[[1]][1]
concept_k <- themes[[k]]$concepts[[2]][1]
checks[l, "scope"] <- scope_k
checks[l, "concept"] <- concept_k


# fill country
if (isTRUE(scope_k == "National")){ # isTRUE() guards against a missing scope
str_country <- substr(checks[l, "canonical"], 62, nchar(checks[l, "canonical"]))
slash_loc <- unlist(gregexpr('/', str_country))
checks[l, "country"] <- substr(str_country, slash_loc[1]+1, slash_loc[2]-1)
}

l <- l + 1
}
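# e.g. if the canonical path after the fixed prefix reads "<folder>/<country>/...",
# the segment between the first and second "/" becomes "country" (illustrative layout)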

# fill "description" ----
print(paste0("fill description: ", i))
checks[j:(i+length(description)), "description"] <- ifelse(is.na(description), 0, 1) # does it have a description? (no = 0, yes = 1)


# fill providers ----
print(paste0("fill providers: ", i))

# fill publisher
l <- j
for (k in seq_along(providers)){
distributor_k <- subset(providers[[k]]$name, unlist(providers[[k]]$roles) == 'publisher' |
unlist(providers[[k]]$roles) == 'distributor' |
unlist(providers[[k]]$roles) == 'pointOfContact')
if("" %in% distributor_k){
distributor_k <- subset(distributor_k, distributor_k != "")

if (length(distributor_k) == 0){
distributor_k <- NA
}
}

checks[l, "publisher"] <- paste(distributor_k, collapse = ",")
l <- l + 1
}

# fill "providers"
# does it have providers? ('Unknown' = 0, yes = 1)
l <- j
for (k in seq_along(providers)){
providers_k <- providers[[k]]$name

# if("" %in% providers_k){
# providers_k <- which(providers_k != "")
# }

if (any(providers_k == "Unknown") | any(is.na(providers_k))){
checks[l, "providers"] <- 0
} else {
checks[l, "providers"] <- 1
}
l <- l + 1
}

# fill "providers_url" and "prov_url_works"
l <- j
for (k in seq_along(providers)){
providers_k <- providers[[k]]$contactInfo$url$url

providers_k <- subset(providers_k, !is.na(providers_k))

if(length(providers_k) == 0){
checks[l, "prov_url"] <- NA
} else{
checks[l, "prov_url"] <- paste(providers_k, collapse = ",")

if (any(grepl("http://", providers_k) | grepl("https://", providers_k))){

link_works_k <- sapply(providers_k,valid_url)

if(all(link_works_k)){
checks[l, "prov_url_works"] <- "TRUE"
} else{
checks[l, "prov_url_works"] <- paste(as.character(link_works_k), collapse = ",")
}
} else {
checks[l, "prov_url_works"] <- "character"
}
}
l <- l + 1
}
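# note: in the loop above, "character" is a sentinel meaning a provider url is present
# but has no http(s) scheme, so it cannot be tested with valid_url()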

# fill email
# extract the email
l <- j
for (k in seq_along(providers)){
email_k <- providers[[k]]$contactInfo$email
email_k <- subset(email_k, !is.na(email_k))

if(nrow(email_k) == 0){
checks[l, "email"] <- NA
} else {
checks[l, "email"] <- paste(as.character(email_k), collapse = ",")
}
l <- l + 1
}

# fill "time" ----
print(paste0("fill time: ", i))
# check the temporal extent
l <- j
for (k in seq_along(time)){

if(is.list(time)){
checks[l, "time"] <- paste(time[[k]], collapse = ",")
} else {
# index with k (position within this page), not l (global row counter)
checks[l, "time"] <- paste(time[k], collapse = ",")
}
l <- l + 1
}

# fill created
l <- j
for (k in seq_along(created)){
checks[l, "created"] <- created[k]
l <- l + 1
}

# fill updated
l <- j
for (k in seq_along(updated)){
checks[l, "updated"] <- updated[k]
l <- l + 1
}

# fill external id ----
print(paste0("fill external id: ", i))

# does it have an ext_id? (no = NA; yes = does the link work? TRUE/FALSE)
l <- j
for (k in seq_along(ext_id)){
ext_id_k <- ext_id[[k]]
# fill "ext_id_works" and "ext_id"
if (is.null(ext_id_k)){
checks[l, "ext_id_works"] <- NA
checks[l, "ext_id"] <- NA
} else {
checks[l, "ext_id"] <- ext_id_k
if (grepl("^http://", ext_id_k) | grepl("^https://", ext_id_k)){
checks[l, "ext_id_works"] <- as.character(sapply(ext_id_k,valid_url))
} else {
checks[l, "ext_id_works"] <- "character"
}
}
l <- l + 1
}

# fill links ----
# does it have links? (no = NA; yes = does each link work? TRUE/FALSE)
print(paste0("fill links: ", i))

l <- j
for (k in seq_along(links)){

# extract links # k
links_k <- subset(links[[k]], type != "canonical" & type != "application/json" & type != "metadata")

# fill with NA if there are no links available
if (nrow(links_k) == 0){
checks[l, "links"] <- NA
checks[l, "link_works"] <- NA
} else {
# paste together the links in one vector
checks[l, "links"] <- paste(links_k$href, collapse = ",")

# check if link works

# initialize "link_works"
link_works <- c()

for (m in seq_along(links_k$href)){
link_m <- links_k$href[m]

cond1 <- grepl("^https?://", link_m)
cond2 <- grepl("^10\\.", link_m)

# check links that start with "http://" or "https://"
if (cond1){
link_works <- c(link_works, sapply(link_m, valid_url))
}

# check links that start with "10.", which are bare DOIs
if (cond2){
# flag the incomplete link, appending to any existing note instead of overwriting it
note_doi <- "add \"https://doi.org/\" to doi"
checks[l, "links_notes"] <- ifelse(is.na(checks[l, "links_notes"]),
note_doi, paste(checks[l, "links_notes"], note_doi, sep = ", "))
checks[l, "url_fix"] <- link_m

# fix the link
link_f <- paste0("https://doi.org/", link_m)
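# e.g. a bare "10.5281/zenodo.1234567" becomes "https://doi.org/10.5281/zenodo.1234567" (illustrative DOI)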

link_works <- c(link_works, sapply(link_f, valid_url))
}

# if the link starts with neither "http(s)://" nor "10."
if (!cond1 & !cond2){
link_works <- c(link_works, NA)
note_check <- paste0("check ", link_m)
checks[l, "links_notes"] <- ifelse(is.na(checks[l, "links_notes"]),
note_check, paste(checks[l, "links_notes"], note_check, sep = ", "))
}

}

# if all the links work, write a single "TRUE"
# (isTRUE() because link_works may contain NA for unchecked links)
if (isTRUE(all(link_works))){
checks[l, "link_works"] <- "TRUE"
} else{
# if at least one link fails or could not be checked, write the full vector
checks[l, "link_works"] <- paste(as.character(link_works), collapse = ",")
}

} # else closes

l <- l + 1
} # for closes

# fill catalogue url ----
# construct each record's url in the catalogue

l <- j
for (k in seq_along(links)){
catlinkl <- "https://catalogue.ejpsoil.eu/collections/metadata:main/items/"
catlinkl_k <- paste0(catlinkl, URLencode(id[k], reserved = TRUE, repeated = TRUE))
checks[l, "catalogue_url"] <- catlinkl_k
#print(catlinkl_k)
l <- l + 1
}
}

#test <- subset(checks, !rowSums(is.na(checks)) == ncol(checks))

# join "title_notes"
previous_xlsx <- read.xlsx("~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", sheetIndex = 1)
# delete empty "title_notes"
#drop <- c("title_notes")
#checks <- checks[,!(names(checks) %in% drop)]
#test <- subset(checks, !is.na(check_id))

checks_plus_notes <- left_join(checks, previous_xlsx[c("id", "title_notes")], by = "id")
checks_plus_notes <- subset(checks_plus_notes, !duplicated(checks_plus_notes))
checks_plus_notes <- checks_plus_notes %>% relocate(title_notes, .after = title_check)

# create a new column with link pointing to "diana" branch
checks_plus_notes["diana"] <- lapply(checks_plus_notes["canonical"], gsub, pattern = "/main/", replacement = "/diana/")

# save xlsx
write.xlsx(checks_plus_notes, file = "~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_15.xlsx", row.names = FALSE)

19 changes: 19 additions & 0 deletions scripts/validation/EJP-check-titles.R
@@ -0,0 +1,19 @@
# task: check that titles make sense

# load libraries ----
library(xlsx)

# load data
xlsx <- read.xlsx("~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", sheetIndex = 1)

# check names
names(xlsx)

# # select columns useful for title check
# df <- xlsx[c("check_id", "id", "title", "title_check", "title_notes",
# "scope", "country", "catalogue_url", "canonical")]

xlsx["diana"] <- lapply(xlsx["canonical"], gsub, pattern = "/main/", replacement = "/diana/")

# save table
write.xlsx(xlsx, file = "~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", row.names = FALSE)
