From cafaf3aaa12e5066214a8f0d0af8702aff57e9dd Mon Sep 17 00:00:00 2001 From: Ewan Cahen Date: Wed, 1 Nov 2023 17:00:52 +0100 Subject: [PATCH 1/3] fix: increase authors column in mention table --- database/009-create-mention-table.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/database/009-create-mention-table.sql b/database/009-create-mention-table.sql index 2ea751246..9fc2c1bb9 100644 --- a/database/009-create-mention-table.sql +++ b/database/009-create-mention-table.sql @@ -33,7 +33,7 @@ CREATE TABLE mention ( doi_registration_date TIMESTAMPTZ, url VARCHAR(500) CHECK (url ~ '^https?://'), title VARCHAR(500) NOT NULL, - authors VARCHAR(15000), + authors VARCHAR(50000), publisher VARCHAR(255), publication_year SMALLINT, journal VARCHAR(500), From 489d5c773566145cb6c04aadc7b1b71cadaa2263 Mon Sep 17 00:00:00 2001 From: Ewan Cahen Date: Thu, 2 Nov 2023 14:26:07 +0100 Subject: [PATCH 2/3] fix: save exceptions when scraping mentions in the database --- database/118-backend-logs-views.sql | 8 +++-- .../nl/esciencecenter/rsd/scraper/Utils.java | 10 ++++--- .../rsd/scraper/doi/CrossrefMention.java | 2 +- .../doi/DataciteMentionRepository.java | 4 ++- .../rsd/scraper/doi/MainMentions.java | 27 +++++++++-------- .../doi/PostgrestMentionRepository.java | 29 +++++++++++++++++-- 6 files changed, 57 insertions(+), 23 deletions(-) diff --git a/database/118-backend-logs-views.sql b/database/118-backend-logs-views.sql index 8f9dfd24b..b8d6396b4 100644 --- a/database/118-backend-logs-views.sql +++ b/database/118-backend-logs-views.sql @@ -16,16 +16,20 @@ $$ SELECT CASE WHEN table_name = 'repository_url' THEN ( SELECT - CONCAT('/software/',slug,'/edit/information') as slug + CONCAT('/software/', slug, '/edit/information') as slug FROM software WHERE id = reference_id ) WHEN table_name = 'package_manager' THEN ( SELECT - CONCAT('/software/',slug,'/edit/package-managers') as slug + CONCAT('/software/', slug, '/edit/package-managers') as slug FROM software WHERE id = (SELECT software FROM package_manager WHERE id = reference_id)) + WHEN table_name = 'mention' AND reference_id IS NOT NULL THEN ( + SELECT + CONCAT('/api/v1/mention?id=eq.', reference_id) as slug + ) END $$; diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java index 3b4fabb7f..6ef5a5030 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java @@ -235,12 +235,14 @@ public static void saveExceptionInDatabase(String serviceName, String tableName, postAsAdmin(Config.backendBaseUrl() + "/backend_log", logData.toString()); } - public static void saveErrorMessageInDatabase(String message, String tableName, String columnName, String primaryKey, String primaryKeyName, ZonedDateTime scrapedAt, String scapedAtName) { + public static void saveErrorMessageInDatabase(String message, String tableName, String columnName, String primaryKey, String primaryKeyName, ZonedDateTime scrapedAt, String scrapedAtName) { JsonObject body = new JsonObject(); - body.addProperty(columnName, message); + if (columnName != null) { + body.addProperty(columnName, message); + } - if (scrapedAt != null && scapedAtName != null) { - body.addProperty(scapedAtName, scrapedAt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); + if (scrapedAt != null && scrapedAtName != null) { + body.addProperty(scrapedAtName, scrapedAt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); } String uri = createPatchUri(Config.backendBaseUrl(), tableName, primaryKey, primaryKeyName); diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java index 0abadd302..431d159df 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java @@ -99,7 +99,7 @@ public MentionRecord mentionData() { } catch (RuntimeException e) { // year not found, we leave it at null, nothing to do } - if (workJson.getAsJsonArray("container-title").size() > 0) { + if (!workJson.getAsJsonArray("container-title").isEmpty()) { JsonArray journalTitles = workJson.getAsJsonArray("container-title"); result.journal = journalTitles.get(0).getAsString(); for (int i = 1; i < journalTitles.size(); i++) { diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java index 60dde05ce..c16754f8f 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java @@ -202,7 +202,9 @@ public Collection leastRecentlyScrapedMentions(int limit) { @Override public Collection mentionData(Collection dois) { - if (dois.isEmpty()) return Collections.EMPTY_LIST; + if (dois.isEmpty()) { + return Collections.emptyList(); + } JsonObject body = new JsonObject(); body.addProperty("query", QUERY_UNFORMATTED.formatted(joinCollection(dois))); diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java index 1bdc06943..7cb807c7a 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2022 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -26,9 +26,9 @@ public static void main(String[] args) { System.out.println("Start scraping mentions"); MentionRepository localMentionRepository = new PostgrestMentionRepository(Config.backendBaseUrl()); Collection mentionsToScrape = localMentionRepository.leastRecentlyScrapedMentions(Config.maxRequestsDoi()); -// we will remove successfully scraped mentions from here, -// we use this to set scrapedAt even for failed mentions, -// to put them back at the scraping order + // we will remove successfully scraped mentions from here, + // we use this to set scrapedAt even for failed mentions, + // to put them back at the scraping order Map mentionsFailedToScrape = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); for (MentionRecord mentionRecord : mentionsToScrape) { mentionsFailedToScrape.put(mentionRecord.doi, mentionRecord); @@ -46,13 +46,12 @@ public static void main(String[] args) { Collection dataciteDois = doiToSource.entrySet() .stream() .filter(doiSourceEntry -> doiSourceEntry.getValue().equals("DataCite")) - .map(doiSourceEntry -> doiSourceEntry.getKey()) + .map(Map.Entry::getKey) .toList(); try { scrapedMentions.addAll(new DataciteMentionRepository().mentionData(dataciteDois)); } catch (RuntimeException e) { - System.out.println("Failed to scrape from DataCite"); - e.printStackTrace(); + Utils.saveExceptionInDatabase("DataCite mention scraper", "mention", null, e); } for (MentionRecord scrapedMention : scrapedMentions) { mentionsFailedToScrape.remove(scrapedMention.doi); @@ -61,7 +60,7 @@ public static void main(String[] args) { Collection crossrefDois = doiToSource.entrySet() .stream() .filter(doiSourceEntry -> doiSourceEntry.getValue().equals("Crossref")) - .map(doiSourceEntry -> doiSourceEntry.getKey()) + .map(Map.Entry::getKey) .toList(); for (String crossrefDoi : crossrefDois) { try { @@ -69,8 +68,8 @@ public static void main(String[] args) { scrapedMentions.add(scrapedMention); mentionsFailedToScrape.remove(scrapedMention.doi); } catch (RuntimeException e) { - System.out.println("Failed to scrape a Crossref mention with DOI " + crossrefDoi); - e.printStackTrace(); + RuntimeException exceptionWithMessage = new RuntimeException("Failed to scrape a Crossref mention with DOI " + crossrefDoi, e); + Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", null, exceptionWithMessage); } } @@ -81,7 +80,11 @@ public static void main(String[] args) { scrapedMentions.addAll(mentionsFailedToScrape.values()); - localMentionRepository.save(scrapedMentions); + try { + localMentionRepository.save(scrapedMentions); + } catch (RuntimeException e) { + Utils.saveExceptionInDatabase("Mention scraper", "mention", null, e); + } System.out.println("Done scraping mentions"); } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java index c38ae1281..c793b6a76 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java @@ -78,11 +78,34 @@ public void save(Collection mentions) { } String uri = "%s/mention?on_conflict=%s&select=id".formatted(backendUrl, onConflictFilter); - String response = Utils.postAsAdmin(uri, scrapedMentionJson, "Prefer", "resolution=merge-duplicates,return=representation"); + String response; + try { + response = Utils.postAsAdmin(uri, scrapedMentionJson, "Prefer", "resolution=merge-duplicates,return=representation"); + } catch (RuntimeException e) { + if (mention.doi == null) { + Utils.saveExceptionInDatabase("Mention scraper", "mention", null, e); + } else { + // We will try to update the scraped_at field, so that it goes back into the queue for being scraped + String existingMentionResponse = Utils.getAsAdmin("%s/mention?doi=eq.%s&select=id".formatted(backendUrl, mention.doi)); + JsonArray array = JsonParser.parseString(existingMentionResponse).getAsJsonArray(); + String id = array.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString(); + Utils.saveErrorMessageInDatabase(null, + "mention", + null, + id, + "id", + ZonedDateTime.now(), + "scraped_at"); + + Utils.saveExceptionInDatabase("Mention scraper", "mention", UUID.fromString(id), e); + } + + continue; + } JsonArray responseAsArray = JsonParser.parseString(response).getAsJsonArray(); - UUID id = UUID.fromString(responseAsArray.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString()); - mention.id = id; + // Used in MainCitations, do not remove + mention.id = UUID.fromString(responseAsArray.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString()); } } } From 182cd50ee789f855f7d7730aff596c7066cd6a06 Mon Sep 17 00:00:00 2001 From: Ewan Cahen Date: Fri, 3 Nov 2023 11:42:41 +0100 Subject: [PATCH 3/3] docs: add warning about special characters in .env.example --- .env.example | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.env.example b/.env.example index be57a4185..95130763b 100644 --- a/.env.example +++ b/.env.example @@ -7,6 +7,11 @@ # example env file # copy to .env and /frontend/.env.local +################ WARNING ################ +# Using special characters in the values (e.g. in passwords or secrets) might corrupt some processes. +# If you experience any problems, remove the special characters from the values or place them in quotes (' or "). +################ WARNING ################ + # .env.local is consumed by frontent (Next) # see https://nextjs.org/docs/basic-features/environment-variables