From fc7a02d86f9c28b7bf024ea696f1f87290413568 Mon Sep 17 00:00:00 2001 From: jochen_vermeulen Date: Thu, 3 Oct 2024 11:35:13 +0200 Subject: [PATCH] MET-5960: Start of an improvement to split by the actual language of the value itself. --- .../workflow/DeBiasProcessServiceImpl.java | 60 ++++++++++++++----- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/src/main/java/eu/europeana/metis/sandbox/service/workflow/DeBiasProcessServiceImpl.java b/src/main/java/eu/europeana/metis/sandbox/service/workflow/DeBiasProcessServiceImpl.java index c6814b5f..0c89304d 100644 --- a/src/main/java/eu/europeana/metis/sandbox/service/workflow/DeBiasProcessServiceImpl.java +++ b/src/main/java/eu/europeana/metis/sandbox/service/workflow/DeBiasProcessServiceImpl.java @@ -14,7 +14,9 @@ import eu.europeana.metis.schema.jibx.RDF; import eu.europeana.metis.schema.jibx.ResourceOrLiteralType; import java.nio.charset.StandardCharsets; +import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Locale; @@ -94,36 +96,56 @@ private HashMap doDeBiasAndGenerateReport(DetectionParamet return deBiasReport; } - private List getDescriptionsFromRecordList(List recordList) { + enum SupportedLanguage{ + ENGLISH("en"), + ITALIAN("it"), + GERMAN("de"), + DUTCH("nl"), + FRENCH("fr"); + + private final String prefix; + + SupportedLanguage(String prefix) { + this.prefix = prefix; + } + + public static SupportedLanguage match(String language) { + final String mainLanguage = language.split("-")[0]; + return Arrays.stream(SupportedLanguage.values()) + .filter(lang -> lang.prefix.equals(mainLanguage)).findAny().orElse(null); + } + } + + record ValueToCheck(String value, SupportedLanguage language, long recordId){} + + private List getDescriptionsFromRecordList(List recordList) { return recordList .stream() .map(recordToProcess -> { - String recordDescription = ""; + List result; try { - recordDescription = String.join(" ", - getDescriptionsFromRdf( + result = getDescriptionsFromRdf( new RdfConversionUtils() .convertStringToRdf( new String(recordToProcess.getContent(), StandardCharsets.UTF_8) - ) - ) - ); + ), recordToProcess.getRecordId() + ); } catch (SerializationException e) { - recordDescription = ""; + result = Collections.emptyList(); } - LOGGER.info("DeBias Execution over: {} {}", recordToProcess.getRecordId(), recordDescription); - return recordDescription; - }).toList(); + LOGGER.info("DeBias Execution over: {}", recordToProcess.getRecordId()); + return result; + }).flatMap(Collection::stream).toList(); } - private List getDescriptionsFromRdf(RDF rdf) { + private List getDescriptionsFromRdf(RDF rdf, long recordId) { List providerProxies = this.getProviderProxies(rdf); List choices = providerProxies.stream().map(EuropeanaType::getChoiceList).filter(Objects::nonNull) .flatMap(Collection::stream).toList(); return this.getChoicesInStringList(choices, EuropeanaType.Choice::ifDescription, EuropeanaType.Choice::getDescription, - ResourceOrLiteralType::getString); + ResourceOrLiteralType::getString, value->value.getLang().getLang(), recordId); } private boolean isProviderProxy(ProxyType proxy) { @@ -137,8 +159,14 @@ private List getProviderProxies(RDF rdf) { .filter(this::isProviderProxy).toList(); } - private List getChoicesInStringList(List choices, Predicate choicePredicate, - Function choiceGetter, Function getString) { - return choices.stream().filter(Objects::nonNull).filter(choicePredicate).map(choiceGetter).map(getString).toList(); + private List getChoicesInStringList(List choices, + Predicate choicePredicate, Function choiceGetter, + Function getString, Function getLanguage, long recordId) { + return choices.stream().filter(Objects::nonNull).filter(choicePredicate).map(choiceGetter) + .map(value->{ + return Optional.ofNullable(SupportedLanguage.match(getLanguage.apply(value))).map(lang->new ValueToCheck(getString.apply(value), lang, recordId)).orElse(null); + + }) + .toList(); } }