add scripts to EJP repository
colla007 committed Jan 31, 2025
1 parent c9dbb34 commit 35b6d22
Showing 9 changed files with 996 additions and 1 deletion.
379 changes: 379 additions & 0 deletions scripts/validation/EJP-catalogue-cleaning.R
@@ -0,0 +1,379 @@
# task: assess the content of the EJP SOIL catalogue at https://catalogue.ejpsoil.eu

# load libraries ----
library(jsonlite)
library(xlsx)
library(stringr)
library(dplyr)

# create an empty data frame to store the assessment ----
# character vector with the column names
columns <- c("check_id", "id",
"title", "title_check", #"title_notes",
"keywords", "scope", "country", "concept", "folder",
"description", "license",
"time", "created", "updated",
"providers", "publisher", "prov_url", "prov_url_works", "email",
"ext_id", "ext_id_works", "ext_id_notes",
"links", "link_works", "links_notes", "url_fix",
"catalogue_url", "canonical")
# pass this vector's length to ncol and 0 to nrow
checks <- data.frame(matrix(nrow = 0, ncol = length(columns)))
# assign column names
colnames(checks) <- columns
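# checks is now a zero-row data frame with these 28 named columns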

# valid url function ----
# returns TRUE if url_in can be opened within t seconds, FALSE otherwise
valid_url <- function(url_in, t = 2){
con <- url(url_in)
check <- suppressWarnings(try(open.connection(con, open = "rt", timeout = t), silent = TRUE)[1])
suppressWarnings(try(close.connection(con), silent = TRUE))
is.null(check)
}

# urls <- c("https://www.dov.vlaanderen.be/bodemverkenner", ... )
# sapply(urls,valid_url)
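# a quick illustrative check (hypothetical URLs, requires network access):
# sapply(c("https://example.org", "http://no.such.host.invalid"), valid_url)
# # expected: TRUE for the reachable url, FALSE for the unreachable one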

# loop catalog ----

# the catalogue paginates in steps of 50 items and holds 577 in total ----
nmax <- 577
offset_i <- seq(from = 0, to = nmax, by = 50)
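# seq(0, 577, 50) gives 0, 50, 100, ..., 550: twelve requests of up to 50 records each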

for (i in offset_i){

# load JSON catalog from url
EJP_catalog <- fromJSON(paste0("https://catalogue.ejpsoil.eu/collections/metadata:main/items?offset=", i))
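# the response is GeoJSON-like; the fields read below live under features$id,
# features$time, features$links and features$properties (title, keywords, themes,
# description, created, updated, providers, externalIds, license)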
# get id
id <- EJP_catalog$features$id
# get title
title <- EJP_catalog$features$properties$title # vector
# get keywords
keywords <- EJP_catalog$features$properties$keywords # list
# get themes
themes <- EJP_catalog$features$properties$themes
# get description
description <- EJP_catalog$features$properties$description # vector # does it have a description? (no = 0, yes = 1) does the description make sense? (no = 2)
# get time
time <- EJP_catalog$features$time
# get created
created <- EJP_catalog$features$properties$created
# get updated
updated <- EJP_catalog$features$properties$updated
# get providers
providers <- EJP_catalog$features$properties$providers # df list # does it have providers? (no = 0, yes = 1) what info about providers is significant? email? url?
# get external id's
ext_id <- EJP_catalog$features$properties$externalIds # df list, "value" # does it have ext_id? does it work?
# get links
links <- EJP_catalog$features$links # df list, "href", # does it have href link? (no = 0, yes = 1) does it work?
# get license
license <- EJP_catalog$features$properties$license # vector

# fill checks table

# first row index for this page's records
j <- i + 1

# fill "check_id"
print(paste0("fill id: ", i))
checks[j:(i+length(id)),"check_id"] <- j:(i+length(id))

# fill id ----
checks[j:(i+length(id)),"id"] <- id

# fill "title" ----
print(paste0("fill title: ", i))
checks[j:(i+length(title)), "title"] <- title

# fill "title_check"
checks[j:(i+length(title)), "title_check"] <- ifelse(is.na(checks$title[j:(i+length(title))]), 0, 1) # does it have a title? (no = 0, yes = 1)

# fill "license" ----
if (is.null(license)){
checks[j:(i+length(license)), "license"] <- NA
} else {
checks[j:(i+length(license)), "license"] <- license
}

# fill "keywords" ----
print(paste0("fill keywords: ", i))
# does it have keywords? which?
l <- j
for (k in seq_along(keywords)){
checks[l, "keywords"] <- paste(keywords[[k]], collapse = ",")
l <- l + 1
}
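# e.g. a record whose keywords list is c("soil", "carbon") yields the cell "soil,carbon" (illustrative values)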

# fill canonical ----
# extract github link
l <- j
for (k in seq_along(links)){
canonical_k <- subset(links[[k]], type == "canonical")
checks[l, "canonical"] <- canonical_k$href
l <- l + 1
}

# fill folder ----
l <- j
for (k in seq_along(links)){
canonical_k <- subset(links[[k]], type == "canonical")$href
checks[l, "folder"] <- substr(canonical_k, 62, max(unlist(gregexpr('/', canonical_k)))-1)
l <- l + 1
}
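# note: the hard-coded 62 is assumed to skip a fixed prefix of the canonical GitHub url,
# so "folder" keeps the path segments between that prefix and the last "/" (illustrative)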

# fill scope ----
print(paste0("fill scope: ", i))
l <- j
for (k in seq_along(themes)){
scope_k <- themes[[k]]$concepts[[1]][1]
concept_k <- themes[[k]]$concepts[[2]][1]
checks[l, "scope"] <- scope_k
checks[l, "concept"] <- concept_k


# fill country
if (isTRUE(scope_k == "National")){ # isTRUE() guards against a missing scope
str_country <- substr(checks[l, "canonical"], 62, nchar(checks[l, "canonical"]))
slash_loc <- unlist(gregexpr('/', str_country))
checks[l, "country"] <- substr(str_country, slash_loc[1]+1, slash_loc[2]-1)
}

l <- l + 1
}
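# e.g. if the canonical path after the fixed prefix reads "<folder>/<country>/...",
# the segment between the first and second "/" becomes "country" (illustrative layout)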

# fill "description" ----
print(paste0("fill description: ", i))
checks[j:(i+length(description)), "description"] <- ifelse(is.na(description), 0, 1) # does it have a description? (no = 0, yes = 1)


# fill providers ----
print(paste0("fill providers: ", i))

# fill publisher
l <- j
for (k in seq_along(providers)){
distributor_k <- subset(providers[[k]]$name, unlist(providers[[k]]$roles) == 'publisher' |
unlist(providers[[k]]$roles) == 'distributor' |
unlist(providers[[k]]$roles) == 'pointOfContact')
if("" %in% distributor_k){
distributor_k <- subset(distributor_k, distributor_k != "")

if (length(distributor_k) == 0){
distributor_k <- NA
}
}

checks[l, "publisher"] <- paste(distributor_k, collapse = ",")
l <- l + 1
}

# fill "providers"
# does it have providers? ('Unknown' = 0, yes = 1)
l <- j
for (k in seq_along(providers)){
providers_k <- providers[[k]]$name

# if("" %in% providers_k){
# providers_k <- which(providers_k != "")
# }

if (any(providers_k == "Unknown") | any(is.na(providers_k))){
checks[l, "providers"] <- 0
} else {
checks[l, "providers"] <- 1
}
l <- l + 1
}

# fill "providers_url" and "prov_url_works"
l <- j
for (k in seq_along(providers)){
providers_k <- providers[[k]]$contactInfo$url$url

providers_k <- subset(providers_k, !is.na(providers_k))

if(length(providers_k) == 0){
checks[l, "prov_url"] <- NA
} else{
checks[l, "prov_url"] <- paste(providers_k, collapse = ",")

if (any(grepl("http://", providers_k) | grepl("https://", providers_k))){

link_works_k <- sapply(providers_k,valid_url)

if(all(link_works_k)){
checks[l, "prov_url_works"] <- "TRUE"
} else{
checks[l, "prov_url_works"] <- paste(as.character(link_works_k), collapse = ",")
}
} else {
checks[l, "prov_url_works"] <- "character"
}
}
l <- l + 1
}
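# note: in the loop above, "character" is a sentinel meaning a provider url is present
# but has no http(s) scheme, so it cannot be tested with valid_url()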

# fill email
# extract the email
l <- j
for (k in seq_along(providers)){
email_k <- providers[[k]]$contactInfo$email
email_k <- subset(email_k, !is.na(email_k))

if(nrow(email_k) == 0){
checks[l, "email"] <- NA
} else {
checks[l, "email"] <- paste(as.character(email_k), collapse = ",")
}
l <- l + 1
}

# fill "time" ----
print(paste0("fill time: ", i))
# check the temporal extent
l <- j
for (k in seq_along(time)){

if(is.list(time)){
checks[l, "time"] <- paste(time[[k]], collapse = ",")
} else {
# index with k (position within this page), not l (global row counter)
checks[l, "time"] <- paste(time[k], collapse = ",")
}
l <- l + 1
}

# fill created
l <- j
for (k in seq_along(created)){
checks[l, "created"] <- created[k]
l <- l + 1
}

# fill updated
l <- j
for (k in seq_along(updated)){
checks[l, "updated"] <- updated[k]
l <- l + 1
}

# fill external id ----
print(paste0("fill external id: ", i))

# does it have an ext_id? (no = NA; yes = does the link work? TRUE/FALSE)
l <- j
for (k in seq_along(ext_id)){
ext_id_k <- ext_id[[k]]
# fill "ext_id_works" and "ext_id"
if (is.null(ext_id_k)){
checks[l, "ext_id_works"] <- NA
checks[l, "ext_id"] <- NA
} else {
checks[l, "ext_id"] <- ext_id_k
if (grepl("^http://", ext_id_k) | grepl("^https://", ext_id_k)){
checks[l, "ext_id_works"] <- as.character(sapply(ext_id_k,valid_url))
} else {
checks[l, "ext_id_works"] <- "character"
}
}
l <- l + 1
}

# fill links ----
# does it have links? (no = NA; yes = does each link work? TRUE/FALSE)
print(paste0("fill links: ", i))

l <- j
for (k in seq_along(links)){

# extract links # k
links_k <- subset(links[[k]], type != "canonical" & type != "application/json" & type != "metadata")

# fill with NA if there are no links available
if (nrow(links_k) == 0){
checks[l, "links"] <- NA
checks[l, "link_works"] <- NA
} else {
# paste together the links in one vector
checks[l, "links"] <- paste(links_k$href, collapse = ",")

# check if link works

# initialize "link_works"
link_works <- c()

for (m in seq_along(links_k$href)){
link_m <- links_k$href[m]

cond1 <- grepl("^https?://", link_m)
cond2 <- grepl("^10\\.", link_m)

# check links that start with "http://" or "https://"
if (cond1){
link_works <- c(link_works, sapply(link_m, valid_url))
}

# check links that start with "10.", which are bare DOIs
if (cond2){
# flag the incomplete link, appending to any existing note instead of overwriting it
note_doi <- "add \"https://doi.org/\" to doi"
checks[l, "links_notes"] <- ifelse(is.na(checks[l, "links_notes"]),
note_doi, paste(checks[l, "links_notes"], note_doi, sep = ", "))
checks[l, "url_fix"] <- link_m

# fix the link
link_f <- paste0("https://doi.org/", link_m)
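# e.g. a bare "10.5281/zenodo.1234567" becomes "https://doi.org/10.5281/zenodo.1234567" (illustrative DOI)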

link_works <- c(link_works, sapply(link_f, valid_url))
}

# if the link starts with neither "http(s)://" nor "10."
if (!cond1 & !cond2){
link_works <- c(link_works, NA)
note_check <- paste0("check ", link_m)
checks[l, "links_notes"] <- ifelse(is.na(checks[l, "links_notes"]),
note_check, paste(checks[l, "links_notes"], note_check, sep = ", "))
}

}

# if all the links work, write a single "TRUE"
# (isTRUE() because link_works may contain NA for unchecked links)
if (isTRUE(all(link_works))){
checks[l, "link_works"] <- "TRUE"
} else{
# if at least one link fails or could not be checked, write the full vector
checks[l, "link_works"] <- paste(as.character(link_works), collapse = ",")
}

} # else closes

l <- l + 1
} # for closes

# fill catalogue url ----
# construct each record's url in the catalogue

l <- j
for (k in seq_along(links)){
catlinkl <- "https://catalogue.ejpsoil.eu/collections/metadata:main/items/"
catlinkl_k <- paste0(catlinkl, URLencode(id[k], reserved = TRUE, repeated = TRUE))
checks[l, "catalogue_url"] <- catlinkl_k
#print(catlinkl_k)
l <- l + 1
}
}

#test <- subset(checks, !rowSums(is.na(checks)) == ncol(checks))

# join "title_notes"
previous_xlsx <- read.xlsx("~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", sheetIndex = 1)
# delete empty "title_notes"
#drop <- c("title_notes")
#checks <- checks[,!(names(checks) %in% drop)]
#test <- subset(checks, !is.na(check_id))

checks_plus_notes <- left_join(checks, previous_xlsx[c("id", "title_notes")], by = "id")
checks_plus_notes <- subset(checks_plus_notes, !duplicated(checks_plus_notes))
checks_plus_notes <- checks_plus_notes %>% relocate(title_notes, .after = title_check)

# create a new column with link pointing to "diana" branch
checks_plus_notes["diana"] <- lapply(checks_plus_notes["canonical"], gsub, pattern = "/main/", replacement = "/diana/")

# save xlsx
write.xlsx(checks_plus_notes, file = "~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_15.xlsx", row.names = FALSE)

19 changes: 19 additions & 0 deletions scripts/validation/EJP-check-titles.R
@@ -0,0 +1,19 @@
# task: check that titles make sense

# load libraries ----
library(xlsx)

# load data
xlsx <- read.xlsx("~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", sheetIndex = 1)

# check names
names(xlsx)

# # select columns useful for title check
# df <- xlsx[c("check_id", "id", "title", "title_check", "title_notes",
# "scope", "country", "catalogue_url", "canonical")]

xlsx["diana"] <- lapply(xlsx["canonical"], gsub, pattern = "/main/", replacement = "/diana/")

# save table
write.xlsx(xlsx, file = "~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", row.names = FALSE)
