Skip to content

Commit

Permalink
Merge pull request #1043 from research-software-directory/1039-author…
Browse files Browse the repository at this point in the history
…-size

1039 author size
  • Loading branch information
ewan-escience authored Nov 6, 2023
2 parents e54a0e0 + 182cd50 commit 0e25a75
Show file tree
Hide file tree
Showing 8 changed files with 63 additions and 24 deletions.
5 changes: 5 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
# example env file
# copy to .env and /frontend/.env.local

################ WARNING ################
# Using special characters in the values (e.g. in passwords or secrets) might corrupt some processes.
# If you experience any problems, remove the special characters from the values or place them in quotes (' or ").
################ WARNING ################

# .env.local is consumed by frontend (Next)
# see https://nextjs.org/docs/basic-features/environment-variables

Expand Down
2 changes: 1 addition & 1 deletion database/009-create-mention-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ CREATE TABLE mention (
doi_registration_date TIMESTAMPTZ,
url VARCHAR(500) CHECK (url ~ '^https?://'),
title VARCHAR(500) NOT NULL,
authors VARCHAR(15000),
authors VARCHAR(50000),
publisher VARCHAR(255),
publication_year SMALLINT,
journal VARCHAR(500),
Expand Down
8 changes: 6 additions & 2 deletions database/118-backend-logs-views.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,20 @@ $$
SELECT CASE
WHEN table_name = 'repository_url' THEN (
SELECT
CONCAT('/software/',slug,'/edit/information') as slug
CONCAT('/software/', slug, '/edit/information') as slug
FROM
software WHERE id = reference_id
)
WHEN table_name = 'package_manager' THEN (
SELECT
CONCAT('/software/',slug,'/edit/package-managers') as slug
CONCAT('/software/', slug, '/edit/package-managers') as slug
FROM
software
WHERE id = (SELECT software FROM package_manager WHERE id = reference_id))
WHEN table_name = 'mention' AND reference_id IS NOT NULL THEN (
SELECT
CONCAT('/api/v1/mention?id=eq.', reference_id) as slug
)
END
$$;

Expand Down
10 changes: 6 additions & 4 deletions scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,14 @@ public static void saveExceptionInDatabase(String serviceName, String tableName,
postAsAdmin(Config.backendBaseUrl() + "/backend_log", logData.toString());
}

public static void saveErrorMessageInDatabase(String message, String tableName, String columnName, String primaryKey, String primaryKeyName, ZonedDateTime scrapedAt, String scapedAtName) {
public static void saveErrorMessageInDatabase(String message, String tableName, String columnName, String primaryKey, String primaryKeyName, ZonedDateTime scrapedAt, String scrapedAtName) {
JsonObject body = new JsonObject();
body.addProperty(columnName, message);
if (columnName != null) {
body.addProperty(columnName, message);
}

if (scrapedAt != null && scapedAtName != null) {
body.addProperty(scapedAtName, scrapedAt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME));
if (scrapedAt != null && scrapedAtName != null) {
body.addProperty(scrapedAtName, scrapedAt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME));
}

String uri = createPatchUri(Config.backendBaseUrl(), tableName, primaryKey, primaryKeyName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ public MentionRecord mentionData() {
} catch (RuntimeException e) {
// year not found, we leave it at null, nothing to do
}
if (workJson.getAsJsonArray("container-title").size() > 0) {
if (!workJson.getAsJsonArray("container-title").isEmpty()) {
JsonArray journalTitles = workJson.getAsJsonArray("container-title");
result.journal = journalTitles.get(0).getAsString();
for (int i = 1; i < journalTitles.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,9 @@ public Collection<MentionRecord> leastRecentlyScrapedMentions(int limit) {

@Override
public Collection<MentionRecord> mentionData(Collection<String> dois) {
if (dois.isEmpty()) return Collections.EMPTY_LIST;
if (dois.isEmpty()) {
return Collections.emptyList();
}

JsonObject body = new JsonObject();
body.addProperty("query", QUERY_UNFORMATTED.formatted(joinCollection(dois)));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2022 Ewan Cahen (Netherlands eScience Center) <e.cahen@esciencecenter.nl>
// SPDX-FileCopyrightText: 2022 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <e.cahen@esciencecenter.nl>
// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -26,9 +26,9 @@ public static void main(String[] args) {
System.out.println("Start scraping mentions");
MentionRepository localMentionRepository = new PostgrestMentionRepository(Config.backendBaseUrl());
Collection<MentionRecord> mentionsToScrape = localMentionRepository.leastRecentlyScrapedMentions(Config.maxRequestsDoi());
// we will remove successfully scraped mentions from here,
// we use this to set scrapedAt even for failed mentions,
// to put them back at the scraping order
// we will remove successfully scraped mentions from here,
// we use this to set scrapedAt even for failed mentions,
// to put them back at the scraping order
Map<String, MentionRecord> mentionsFailedToScrape = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
for (MentionRecord mentionRecord : mentionsToScrape) {
mentionsFailedToScrape.put(mentionRecord.doi, mentionRecord);
Expand All @@ -46,13 +46,12 @@ public static void main(String[] args) {
Collection<String> dataciteDois = doiToSource.entrySet()
.stream()
.filter(doiSourceEntry -> doiSourceEntry.getValue().equals("DataCite"))
.map(doiSourceEntry -> doiSourceEntry.getKey())
.map(Map.Entry::getKey)
.toList();
try {
scrapedMentions.addAll(new DataciteMentionRepository().mentionData(dataciteDois));
} catch (RuntimeException e) {
System.out.println("Failed to scrape from DataCite");
e.printStackTrace();
Utils.saveExceptionInDatabase("DataCite mention scraper", "mention", null, e);
}
for (MentionRecord scrapedMention : scrapedMentions) {
mentionsFailedToScrape.remove(scrapedMention.doi);
Expand All @@ -61,16 +60,16 @@ public static void main(String[] args) {
Collection<String> crossrefDois = doiToSource.entrySet()
.stream()
.filter(doiSourceEntry -> doiSourceEntry.getValue().equals("Crossref"))
.map(doiSourceEntry -> doiSourceEntry.getKey())
.map(Map.Entry::getKey)
.toList();
for (String crossrefDoi : crossrefDois) {
try {
MentionRecord scrapedMention = new CrossrefMention(crossrefDoi).mentionData();
scrapedMentions.add(scrapedMention);
mentionsFailedToScrape.remove(scrapedMention.doi);
} catch (RuntimeException e) {
System.out.println("Failed to scrape a Crossref mention with DOI " + crossrefDoi);
e.printStackTrace();
RuntimeException exceptionWithMessage = new RuntimeException("Failed to scrape a Crossref mention with DOI " + crossrefDoi, e);
Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", null, exceptionWithMessage);
}
}

Expand All @@ -81,7 +80,11 @@ public static void main(String[] args) {
scrapedMentions.addAll(mentionsFailedToScrape.values());


localMentionRepository.save(scrapedMentions);
try {
localMentionRepository.save(scrapedMentions);
} catch (RuntimeException e) {
Utils.saveExceptionInDatabase("Mention scraper", "mention", null, e);
}

System.out.println("Done scraping mentions");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,34 @@ public void save(Collection<MentionRecord> mentions) {
}

String uri = "%s/mention?on_conflict=%s&select=id".formatted(backendUrl, onConflictFilter);
String response = Utils.postAsAdmin(uri, scrapedMentionJson, "Prefer", "resolution=merge-duplicates,return=representation");
String response;
try {
response = Utils.postAsAdmin(uri, scrapedMentionJson, "Prefer", "resolution=merge-duplicates,return=representation");
} catch (RuntimeException e) {
if (mention.doi == null) {
Utils.saveExceptionInDatabase("Mention scraper", "mention", null, e);
} else {
// We will try to update the scraped_at field, so that it goes back into the queue for being scraped
String existingMentionResponse = Utils.getAsAdmin("%s/mention?doi=eq.%s&select=id".formatted(backendUrl, mention.doi));
JsonArray array = JsonParser.parseString(existingMentionResponse).getAsJsonArray();
String id = array.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString();
Utils.saveErrorMessageInDatabase(null,
"mention",
null,
id,
"id",
ZonedDateTime.now(),
"scraped_at");

Utils.saveExceptionInDatabase("Mention scraper", "mention", UUID.fromString(id), e);
}

continue;
}

JsonArray responseAsArray = JsonParser.parseString(response).getAsJsonArray();
UUID id = UUID.fromString(responseAsArray.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString());
mention.id = id;
// Used in MainCitations, do not remove
mention.id = UUID.fromString(responseAsArray.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString());
}
}
}

0 comments on commit 0e25a75

Please sign in to comment.