Commit 35b6d22 by colla007, committed Jan 31, 2025 (1 parent: c9dbb34).
Showing 9 changed files with 996 additions and 1 deletion.
@@ -0,0 +1,379 @@
# task: assess the content of the ejpsoil catalogue at https://catalogue.ejpsoil.eu

# load libraries ----
library(jsonlite)
library(xlsx)
library(stringr)
library(dplyr)

# create an empty data frame to store the assessment ----
# vector with the column names for the checks
columns <- c("check_id", "id",
             "title", "title_check", #"title_notes",
             "keywords", "scope", "country", "concept", "folder",
             "description", "license",
             "time", "created", "updated",
             "providers", "publisher", "prov_url", "prov_url_works", "email",
             "ext_id", "ext_id_works", "ext_id_notes",
             "links", "link_works", "links_notes", "url_fix",
             "catalogue_url", "canonical")
# pass the vector length to the ncol parameter and start with nrow = 0
checks <- data.frame(matrix(nrow = 0, ncol = length(columns)))
# assign column names
colnames(checks) <- columns

# valid url function ----
# open a read connection with a short timeout: try() returns NULL when the
# connection opens (reachable url) and a "try-error" otherwise
valid_url <- function(url_in, t = 2){
  con <- url(url_in)
  check <- suppressWarnings(try(open.connection(con, open = "rt", timeout = t), silent = TRUE)[1])
  suppressWarnings(try(close.connection(con), silent = TRUE))
  # NULL means the connection opened, i.e. the url is valid
  is.null(check)
}

# example:
# urls <- c("https://www.dov.vlaanderen.be/bodemverkenner", ... )
# sapply(urls, valid_url)
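
# quick sanity check of valid_url (added example; both urls are
# illustrative): a reachable url should return TRUE and an unresolvable
# host FALSE within the default 2-second timeout
# valid_url("https://catalogue.ejpsoil.eu")        # expected TRUE
# valid_url("https://no-such-host.example")        # expected FALSE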

# loop catalog ----

# the catalogue serves records in pages of 50 and holds 577 in total ----
nmax <- 577
offset_i <- seq(from = 0, to = nmax, by = 50)
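
# the hard-coded total above can drift as records are added; an alternative
# (added sketch, assuming the endpoint reports "numberMatched" as OGC API
# Records services typically do) reads the total once up front:
# first_page <- fromJSON("https://catalogue.ejpsoil.eu/collections/metadata:main/items?limit=1")
# nmax <- first_page$numberMatched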

for (i in offset_i){

  # load the JSON catalogue page from the url
  EJP_catalog <- fromJSON(paste0("https://catalogue.ejpsoil.eu/collections/metadata:main/items?offset=", i))
  # get id
  id <- EJP_catalog$features$id
  # get title
  title <- EJP_catalog$features$properties$title # vector
  # get keywords
  keywords <- EJP_catalog$features$properties$keywords # list
  # get themes
  themes <- EJP_catalog$features$properties$themes
  # get description
  description <- EJP_catalog$features$properties$description # vector # does it have a description? (no = 0, yes = 1) does the description make sense? (no = 2)
  # get time
  time <- EJP_catalog$features$time
  # get created
  created <- EJP_catalog$features$properties$created
  # get updated
  updated <- EJP_catalog$features$properties$updated
  # get providers
  providers <- EJP_catalog$features$properties$providers # df list # does it have providers? (no = 0, yes = 1) what info about providers is significant? email? url?
  # get external ids
  ext_id <- EJP_catalog$features$properties$externalIds # df list, "value" # does it have an ext_id? does it work?
  # get links
  links <- EJP_catalog$features$links # df list, "href" # does it have an href link? (no = 0, yes = 1) does it work?
  # get license
  license <- EJP_catalog$features$properties$license # vector
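
  # optional inspection aid (added; not part of the original script): the
  # field paths above assume the GeoJSON layout served by this catalogue,
  # and a one-off look at a parsed page is a quick way to confirm them
  # str(EJP_catalog$features$properties, max.level = 1)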

  # fill checks table

  # first row to write the records in this iteration
  j <- i + 1

  # fill "check_id"
  print(paste0("fill check_id: ", i))
  checks[j:(i+length(id)), "check_id"] <- j:(i+length(id))

  # fill id ----
  checks[j:(i+length(id)), "id"] <- id

  # fill "title" ----
  print(paste0("fill title: ", i))
  checks[j:(i+length(title)), "title"] <- title

  # fill "title_check"
  # does it have a title? (no = 0, yes = 1)
  checks[j:(i+length(title)), "title_check"] <- ifelse(is.na(checks$title[j:(i+length(title))]), 0, 1)
# fill "license" ---- | ||
if (is.null(license)){ | ||
checks[j:(i+length(license)), "license"] <- NA | ||
} else { | ||
checks[j:(i+length(license)), "license"] <- license | ||
} | ||
|
||
# fill "keywords" ---- | ||
print(paste0("fill keywords: ", i)) | ||
# does it have keywords? which? | ||
l <- j | ||
for (k in seq_along(keywords)){ | ||
checks[l, "keywords"] <- paste(keywords[[k]], collapse = ",") | ||
l <- l + 1 | ||
} | ||
|
||

  # fill canonical ----
  # extract the github link; guard against records without a canonical link
  l <- j
  for (k in seq_along(links)){
    canonical_k <- subset(links[[k]], type == "canonical")
    checks[l, "canonical"] <- if (nrow(canonical_k) > 0) canonical_k$href[1] else NA
    l <- l + 1
  }

  # fill folder ----
  # the folder name sits between a fixed-length url prefix (the first 61
  # characters of the canonical github link) and the last "/"
  l <- j
  for (k in seq_along(links)){
    canonical_k <- subset(links[[k]], type == "canonical")$href
    checks[l, "folder"] <- substr(canonical_k, 62, max(unlist(gregexpr('/', canonical_k)))-1)
    l <- l + 1
  }
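
  # a position-independent alternative to the fixed substr() offsets above
  # (added sketch; assumes the canonical link contains a "/main/" or
  # "/master/" branch segment):
  # rel_path <- sub("^.*/(main|master)/", "", canonical_k)  # path after the branch
  # folder_k <- dirname(rel_path)                           # folder without the file name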

  # fill scope ----
  print(paste0("fill scope: ", i))
  l <- j
  for (k in seq_along(themes)){
    scope_k <- themes[[k]]$concepts[[1]][1]
    concept_k <- themes[[k]]$concepts[[2]][1]
    checks[l, "scope"] <- scope_k
    checks[l, "concept"] <- concept_k

    # fill country
    # the guard keeps a missing scope from breaking the comparison
    if (!is.na(scope_k) && scope_k == "National"){
      str_country <- substr(checks[l, "canonical"], 62, nchar(checks[l, "canonical"]))
      slash_loc <- unlist(gregexpr('/', str_country))
      checks[l, "country"] <- substr(str_country, slash_loc[1]+1, slash_loc[2]-1)
    }

    l <- l + 1
  }
# fill "description" ---- | ||
print(paste0("fill description: ", i)) | ||
checks[j:(i+length(description)), "description"] <- ifelse(is.na(description), 0, 1) # does it have a description? (no = 0, yes = 1) | ||
|
||
|
||

  # fill providers ----
  print(paste0("fill providers: ", i))

  # fill publisher
  l <- j
  for (k in seq_along(providers)){
    roles_k <- unlist(providers[[k]]$roles)
    distributor_k <- subset(providers[[k]]$name,
                            roles_k == 'publisher' |
                            roles_k == 'distributor' |
                            roles_k == 'pointOfContact')
    # drop empty names and fall back to NA when no usable name remains
    distributor_k <- subset(distributor_k, distributor_k != "")
    if (length(distributor_k) == 0){
      distributor_k <- NA
    }

    checks[l, "publisher"] <- paste(distributor_k, collapse = ",")
    l <- l + 1
  }
# fill "providers" | ||
# does it have providers? ('Unknown' = 0, yes = 1) | ||
l <- j | ||
for (k in seq_along(providers)){ | ||
providers_k <- providers[[k]]$name | ||
|
||
# if("" %in% providers_k){ | ||
# providers_k <- which(providers_k != "") | ||
# } | ||
|
||
if (any(providers_k == "Unknown") | any(is.na(providers_k))){ | ||
checks[l, "providers"] <- 0 | ||
} else { | ||
checks[l, "providers"] <- 1 | ||
} | ||
l <- l + 1 | ||
} | ||
|
||
# fill "providers_url" and "prov_url_works" | ||
l <- j | ||
for (k in seq_along(providers)){ | ||
providers_k <- providers[[k]]$contactInfo$url$url | ||
|
||
providers_k <- subset(providers_k, !is.na(providers_k)) | ||
|
||
if(length(providers_k) == 0){ | ||
checks[l, "prov_url"] <- NA | ||
} else{ | ||
checks[l, "prov_url"] <- paste(providers_k, collapse = ",") | ||
|
||
if (any(grepl("http://", providers_k) | grepl("https://", providers_k))){ | ||
|
||
link_works_k <- sapply(providers_k,valid_url) | ||
|
||
if(length(link_works_k) == sum(link_works_k)){ | ||
checks[l, "prov_url_works"] <- "TRUE" | ||
} else{ | ||
checks[l, "prov_url_works"] <- paste(as.character(link_works_k), collapse = ",") | ||
} | ||
} else { | ||
checks[l, "prov_url_works"] <- "character" | ||
} | ||
} | ||
l <- l + 1 | ||
} | ||
|
||

  # fill email
  # extract the email
  l <- j
  for (k in seq_along(providers)){
    email_k <- providers[[k]]$contactInfo$email
    email_k <- subset(email_k, !is.na(email_k))

    # NROW() works whether email_k arrives as a data frame or a plain vector
    if(NROW(email_k) == 0){
      checks[l, "email"] <- NA
    } else {
      checks[l, "email"] <- paste(as.character(email_k), collapse = ",")
    }
    l <- l + 1
  }
# fill "time" ---- | ||
print(paste0("fill time: ", i)) | ||
# check the temporal extent | ||
l <- j | ||
for (k in seq_along(time)){ | ||
|
||
if(is.list(time)){ | ||
checks[l, "time"] <- paste(time[[k]], collapse = ",") | ||
} else { | ||
checks[l, "time"] <- paste(time[l], collapse = ",") | ||
} | ||
l <- l + 1 | ||
} | ||
|
||

  # fill created
  l <- j
  for (k in seq_along(created)){
    checks[l, "created"] <- created[k]
    l <- l + 1
  }

  # fill updated
  l <- j
  for (k in seq_along(updated)){
    checks[l, "updated"] <- updated[k]
    l <- l + 1
  }

  # fill external id ----
  print(paste0("fill external id: ", i))

  # does it have an ext_id? (no = NA, yes(works? no = FALSE, yes = TRUE))
  l <- j
  for (k in seq_along(ext_id)){
    ext_id_k <- ext_id[[k]]
    # the external ids arrive as a small data frame with a "value" column
    if (is.data.frame(ext_id_k)) ext_id_k <- ext_id_k$value
    # fill "ext_id_works" and "ext_id"
    if (is.null(ext_id_k)){
      checks[l, "ext_id_works"] <- NA
      checks[l, "ext_id"] <- NA
    } else {
      checks[l, "ext_id"] <- paste(ext_id_k, collapse = ",")
      if (any(grepl("^https?://", ext_id_k))){
        checks[l, "ext_id_works"] <- paste(as.character(sapply(ext_id_k, valid_url)), collapse = ",")
      } else {
        checks[l, "ext_id_works"] <- "character"
      }
    }
    l <- l + 1
  }

  # fill links ----
  # does it have links? (no = NA, yes(works? no = FALSE, yes = TRUE))
  print(paste0("fill links: ", i))

  l <- j
  for (k in seq_along(links)){

    # extract links # k
    links_k <- subset(links[[k]], type != "canonical" & type != "application/json" & type != "metadata")

    # fill with NA if there are no links available
    if (nrow(links_k) == 0){
      checks[l, "links"] <- NA
      checks[l, "link_works"] <- NA
    } else {
      # paste together the links in one vector
      checks[l, "links"] <- paste(links_k$href, collapse = ",")

      # check if each link works

      # initialize "link_works"
      link_works <- c()

      for (m in seq_along(links_k$href)){
        link_m <- links_k$href[m]

        cond1 <- grepl("^https?://", link_m)
        cond2 <- grepl("^10\\.", link_m)

        # check links that start with "http://" or "https://"
        if (cond1){
          link_works <- c(link_works, valid_url(link_m))
        }

        # links that start with "10." are bare doi's
        if (cond2){
          # flag the incomplete link, appending to any earlier note
          note_m <- "add \"https://doi.org/\" to doi"
          checks[l, "links_notes"] <- ifelse(is.na(checks[l, "links_notes"]),
                                             note_m,
                                             paste(checks[l, "links_notes"], note_m, sep = ", "))
          checks[l, "url_fix"] <- link_m

          # fix the link
          link_f <- paste0("https://doi.org/", link_m)

          link_works <- c(link_works, valid_url(link_f))
        }

        # the link neither starts with "http(s)://" nor looks like a doi
        if (!cond1 & !cond2){
          link_works <- c(link_works, NA)
          note_m <- paste0("check ", link_m)
          checks[l, "links_notes"] <- ifelse(is.na(checks[l, "links_notes"]),
                                             note_m,
                                             paste(checks[l, "links_notes"], note_m, sep = ", "))
        }

      }

      # if all the links work, write just one "TRUE";
      # isTRUE() stays FALSE when link_works contains NA
      if(isTRUE(all(link_works))){
        checks[l, "link_works"] <- "TRUE"
      } else{
        # otherwise write all the TRUE/FALSE/NA values
        checks[l, "link_works"] <- paste(as.character(link_works), collapse = ",")
      }

    } # else closes

    l <- l + 1
  } # for closes

  # fill catalogue url ----
  # build the url to the record in the catalogue
  l <- j
  for (k in seq_along(links)){
    catlinkl <- "https://catalogue.ejpsoil.eu/collections/metadata:main/items/"
    catlinkl_k <- paste0(catlinkl, URLencode(id[k], reserved = TRUE, repeated = TRUE))
    checks[l, "catalogue_url"] <- catlinkl_k
    #print(catlinkl_k)
    l <- l + 1
  }
}

#test <- subset(checks, !rowSums(is.na(checks)) == ncol(checks))

# join "title_notes" from the previous assessment round
previous_xlsx <- read.xlsx("~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", sheetIndex = 1)
# delete empty "title_notes"
#drop <- c("title_notes")
#checks <- checks[,!(names(checks) %in% drop)]
#test <- subset(checks, !is.na(check_id))

checks_plus_notes <- left_join(checks, previous_xlsx[c("id", "title_notes")], by = "id")
checks_plus_notes <- subset(checks_plus_notes, !duplicated(checks_plus_notes))
checks_plus_notes <- checks_plus_notes %>% relocate(title_notes, .after = title_check)

# create a new column with the link pointing to the "diana" branch
checks_plus_notes["diana"] <- lapply(checks_plus_notes["canonical"], gsub, pattern = "/main/", replacement = "/diana/")

# save xlsx
write.xlsx(checks_plus_notes, file = "~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_15.xlsx", row.names = FALSE)
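
# quick post-run sanity checks (added sketch, not in the original script):
# each catalogue record should occupy exactly one row, and a summary of
# "link_works" gives a first overview of how many links respond
# stopifnot(nrow(checks) == length(unique(checks$id)))
# table(checks$link_works, useNA = "ifany")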
@@ -0,0 +1,19 @@
# task: check that titles make sense

# load libraries ----
library(xlsx)

# load data
xlsx <- read.xlsx("~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", sheetIndex = 1)

# check names
names(xlsx)

# # select columns useful for the title check
# df <- xlsx[c("check_id", "id", "title", "title_check", "title_notes",
#              "scope", "country", "catalogue_url", "canonical")]

# create a column with the link pointing to the "diana" branch
xlsx["diana"] <- lapply(xlsx["canonical"], gsub, pattern = "/main/", replacement = "/diana/")

# save table (writes the extra column back to the same file)
write.xlsx(xlsx, file = "~/ISRIC_Workspace/scratch/diana/EJP-catalog-checks_11.xlsx", row.names = FALSE)