towards supporting offline wikidata taxon matcher; related to #181
Jorrit Poelen committed Jun 24, 2024
1 parent 3f90531, commit 5a5741f
Showing 4 changed files with 13,496 additions and 0 deletions.
...c/main/java/org/globalbioticinteractions/nomer/match/TermMatcherWikidataTaxonFactory.java
22 additions & 0 deletions
@@ -0,0 +1,22 @@
package org.globalbioticinteractions.nomer.match;

import org.eol.globi.taxon.TermMatcher;
import org.globalbioticinteractions.nomer.util.TermMatcherContext;

public class TermMatcherWikidataTaxonFactory implements TermMatcherFactory {

    @Override
    public String getPreferredName() {
        return "wikidata";
    }

    @Override
    public String getDescription() {
        return "Lookup Wikidata taxon by name or id using offline-enabled database dump";
    }

    @Override
    public TermMatcher createTermMatcher(TermMatcherContext ctx) {
        return new PBDBTaxonService(ctx);
    }
}
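As committed, createTermMatcher still hands back a PBDBTaxonService. A minimal sketch of the method wired to the WikidataTaxonService introduced below; this rewiring is an assumption about intent, not part of the commit:

    @Override
    public TermMatcher createTermMatcher(TermMatcherContext ctx) {
        // assumption: return the Wikidata-backed service added in this commit
        return new WikidataTaxonService(ctx);
    }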
nomer/src/main/java/org/globalbioticinteractions/nomer/match/WikidataTaxonService.java
349 additions & 0 deletions
@@ -0,0 +1,349 @@
package org.globalbioticinteractions.nomer.match;

import com.Ostermiller.util.LabeledCSVParser;
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.eol.globi.data.CharsetConstant;
import org.eol.globi.domain.NameType;
import org.eol.globi.domain.Taxon;
import org.eol.globi.domain.TaxonImpl;
import org.eol.globi.domain.TaxonomyProvider;
import org.eol.globi.service.PropertyEnricherException;
import org.eol.globi.service.TaxonUtil;
import org.eol.globi.taxon.TaxonCacheService;
import org.eol.globi.util.CSVTSVUtil;
import org.globalbioticinteractions.nomer.util.CacheUtil;
import org.globalbioticinteractions.nomer.util.TermMatcherContext;
import org.globalbioticinteractions.wikidata.WikidataUtil;
import org.mapdb.BTreeKeySerializer;
import org.mapdb.BTreeMap;
import org.mapdb.DB;
import org.mapdb.DBMaker;
import org.mapdb.Serializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import static org.eol.globi.domain.NameType.HAS_ACCEPTED_NAME;
import static org.eol.globi.domain.NameType.NONE;
import static org.eol.globi.domain.NameType.SAME_AS;
import static org.eol.globi.domain.NameType.SYNONYM_OF;

public class WikidataTaxonService extends CommonLongTaxonService {
    private static final Logger LOG = LoggerFactory.getLogger(WikidataTaxonService.class);

    private static final String AUTHORS = "author";
    private BTreeMap<Long, String> refIds;

    public WikidataTaxonService(TermMatcherContext ctx) {
        super(ctx);
    }

    public static List<String> parseRelatedIds(JsonNode jsonNode) {
        List<String> relatedIds = new ArrayList<>();

        JsonNode externalIds = jsonNode.at("/claims");
        for (JsonNode externalId : externalIds) {
            JsonNode dataType = externalId.at("/0/mainsnak/datatype");
            if (!dataType.isMissingNode()) {
                if (StringUtils.equals("external-id", dataType.asText())) {
                    JsonNode externalIdScheme = externalId.at("/0/mainsnak/property");
                    if (!externalIdScheme.isMissingNode()) {
                        String externalIdSchemeValue = externalIdScheme.asText();
                        TaxonomyProvider taxonomyProvider
                                = WikidataUtil.WIKIDATA_TO_PROVIDER.get(externalIdSchemeValue);
                        if (taxonomyProvider != null) {
                            JsonNode identifier = externalId.at("/0/mainsnak/datavalue/value");
                            if (!identifier.isMissingNode()) {
                                relatedIds.add(taxonomyProvider.getIdPrefix() + identifier.asText());
                            }
                        }
                    }
                }
            }
        }
        return relatedIds;
    }
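parseRelatedIds walks each claim, keeps statements whose datatype is "external-id", and maps the Wikidata property to a taxonomy provider via WikidataUtil.WIKIDATA_TO_PROVIDER. Below is a minimal sketch with a hand-written entity fragment, placed next to WikidataTaxonService; the property P685 (NCBI Taxonomy ID), its presence in WIKIDATA_TO_PROVIDER, and the resulting id prefix are assumptions for illustration:

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.util.List;

class ParseRelatedIdsSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical entity fragment shaped like a Wikidata dump record
        JsonNode entity = new ObjectMapper().readTree(
                "{ \"claims\": { \"P685\": [ { \"mainsnak\": {"
                        + " \"datatype\": \"external-id\","
                        + " \"property\": \"P685\","
                        + " \"datavalue\": { \"value\": \"9606\" } } } ] } }");
        List<String> relatedIds = WikidataTaxonService.parseRelatedIds(entity);
        // if P685 is mapped, relatedIds holds "9606" with that provider's id prefix prepended
        System.out.println(relatedIds);
    }
}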

    public static Taxon parseTaxon(JsonNode jsonNode) {
        Taxon taxon = new TaxonImpl();

        JsonNode at = jsonNode.at("/id");
        if (!at.isMissingNode()) {
            taxon.setExternalId("WD:" + at.asText());
        }

        JsonNode labels = jsonNode.at("/claims/P1843");
        List<String> commonNames = new ArrayList<>();
        for (JsonNode label : labels) {
            JsonNode value = label.at("/mainsnak/datavalue/value");
            commonNames.add(value.get("text").asText() + " @" + value.get("language").asText());
        }

        taxon.setCommonNames(StringUtils.join(commonNames, CharsetConstant.SEPARATOR));

        JsonNode name = jsonNode.at("/claims/P225/0/mainsnak/datavalue/value");
        if (!name.isMissingNode()) {
            taxon.setName(name.asText());
        }

        JsonNode parentId = jsonNode.at("/claims/P171/0/mainsnak/datavalue/value/id");
        if (!parentId.isMissingNode()) {
            taxon.setPathIds("WD:" + parentId.asText() + CharsetConstant.SEPARATOR + taxon.getId());
        }
        return taxon;
    }
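A companion sketch of the fields parseTaxon reads: the entity id, P225 (taxon name), P171 (parent taxon), and P1843 (common name). The Q-ids and values are made up for illustration, and the sketch again assumes it sits next to WikidataTaxonService:

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import org.eol.globi.domain.Taxon;

class ParseTaxonSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical Wikidata entity with the properties parseTaxon() looks at
        JsonNode entity = new ObjectMapper().readTree(
                "{ \"id\": \"Q140\", \"claims\": {"
                        + " \"P225\": [ { \"mainsnak\": { \"datavalue\": { \"value\": \"Panthera leo\" } } } ],"
                        + " \"P171\": [ { \"mainsnak\": { \"datavalue\": { \"value\": { \"id\": \"Q127960\" } } } } ],"
                        + " \"P1843\": [ { \"mainsnak\": { \"datavalue\": { \"value\": { \"text\": \"lion\", \"language\": \"en\" } } } } ] } }");
        Taxon taxon = WikidataTaxonService.parseTaxon(entity);
        // externalId "WD:Q140", name "Panthera leo", commonNames "lion @en"
        System.out.println(taxon.getExternalId() + " " + taxon.getName());
    }
}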

    @Override
    public TaxonomyProvider getTaxonomyProvider() {
        return TaxonomyProvider.PBDB;
    }

    void parseNodes(Map<Long, Map<String, String>> taxonMap,
                    Map<Long, Long> childParent,
                    Map<String, List<Long>> name2nodeIds,
                    Map<Long, String> authorIds,
                    BTreeMap<Long, Long> mergedNodes,
                    InputStream is) throws PropertyEnricherException {
        try {
            LabeledCSVParser labeledTSVParser = CSVTSVUtil.createLabeledTSVParser(is);

            while (labeledTSVParser.getLine() != null) {

                String providedId = labeledTSVParser.getValueByLabel("taxon_no");
                String providedName = labeledTSVParser.getValueByLabel("taxon_name");
                String providedRank = labeledTSVParser.getValueByLabel("taxon_rank");
                String providedParentId = labeledTSVParser.getValueByLabel("parent_no");
                Taxon providedTaxon = new TaxonImpl(
                        providedName,
                        TaxonomyProvider.PBDB.getIdPrefix() + providedId
                );
                providedTaxon.setRank(providedRank);

                String acceptedId = labeledTSVParser.getValueByLabel("accepted_no");
                String acceptedName = labeledTSVParser.getValueByLabel("accepted_name");
                String acceptedRank = labeledTSVParser.getValueByLabel("accepted_rank");
                Taxon acceptedTaxon = new TaxonImpl(
                        acceptedName,
                        TaxonomyProvider.PBDB.getIdPrefix() + acceptedId
                );
                acceptedTaxon.setRank(acceptedRank);

                String different = labeledTSVParser.getValueByLabel("difference");

                Map<String, NameType> mapping = new TreeMap<String, NameType>() {{
                    put("corrected to", HAS_ACCEPTED_NAME);
                    put("invalid subgroup of", NONE);
                    put("misspelling of", HAS_ACCEPTED_NAME);
                    put("nomen dubium", SAME_AS);
                    put("nomen nudum", SAME_AS);
                    put("nomen oblitum", SAME_AS);
                    put("nomen vanum", SAME_AS);
                    put("objective synonym of", SYNONYM_OF);
                    put("obsolete variant of", HAS_ACCEPTED_NAME);
                    put("reassigned as", HAS_ACCEPTED_NAME);
                    put("recombined as", SAME_AS);
                    put("replaced by", HAS_ACCEPTED_NAME);
                    put("subjective synonym of", SYNONYM_OF);
                }};

                String authorship = authorIds.get(Long.parseLong(labeledTSVParser.getValueByLabel("reference_no")));

                providedTaxon.setAuthorship(authorship);

                if (NumberUtils.isCreatable(providedId)) {
                    Long taxonKey = Long.parseLong(providedId);
                    registerIdForName(taxonKey, providedTaxon, name2nodeIds);
                    taxonMap.put(taxonKey, TaxonUtil.taxonToMap(providedTaxon));
                    if (NumberUtils.isCreatable(providedParentId)) {
                        childParent.put(
                                taxonKey,
                                Long.parseLong(providedParentId)
                        );
                    }

                    if (mapping.containsKey(different) && NumberUtils.isCreatable(acceptedId)) {
                        mergedNodes.put(Long.parseLong(providedId), Long.parseLong(acceptedId));
                    }
                }

            }
        } catch (IOException e) {
            throw new PropertyEnricherException("failed to parse PBDB taxon dump", e);
        }
    }
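The column labels read above imply a tab-separated taxa dump with a header row. A small sketch of that shape follows; the ids, names, and ranks in the example row are invented:

class PbdbTaxaDumpShape {
    // header with the labels parseNodes() pulls out via getValueByLabel
    static final String HEADER =
            "taxon_no\ttaxon_name\ttaxon_rank\tparent_no\taccepted_no\taccepted_name\taccepted_rank\tdifference\treference_no";
    // hypothetical row: difference "misspelling of" plus a numeric accepted_no
    // would make parseNodes() record 54321 -> 12345 in mergedNodes
    static final String ROW =
            "54321\tTyrannosaurus rexx\tspecies\t38613\t12345\tTyrannosaurus rex\tspecies\tmisspelling of\t1";
}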

    private static void parseReferences(Map<Long, String> refIdMap, InputStream resourceAsStream) throws PropertyEnricherException {
        try {
            LabeledCSVParser parser = CSVTSVUtil.createLabeledTSVParser(resourceAsStream);
            while (parser.getLine() != null) {
                String refId = parser.getValueByLabel("reference_no");
                if (NumberUtils.isCreatable(refId)) {
                    String year = parser.getValueByLabel("pubyr");
                    StringBuilder builder = new StringBuilder();
                    appendIfNotBlank(parser, builder, "author1init", " ", "");
                    appendIfNotBlank(parser, builder, "author1last", " ", "");
                    appendIfNotBlank(parser, builder, "author2init", " ", "and ");
                    appendIfNotBlank(parser, builder, "author2last", " ", "");

                    refIdMap.put(Long.parseLong(refId), builder.toString() + year);
                }
            }
        } catch (IOException e) {
            throw new PropertyEnricherException("failed to parse ITIS taxon unit types", e);
        }
    }

    private static String appendIfNotBlank(LabeledCSVParser parser, StringBuilder builder, String author1init, String suffix, String prefix) {
        String author1First = parser.getValueByLabel(author1init);
        if (StringUtils.isNotBlank(author1First)) {
            builder.append(prefix);
            builder.append(author1First);
            builder.append(suffix);
        }
        return author1First;
    }
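Note that the suffix argument of appendIfNotBlank comes before the prefix argument, which is easy to misread. A worked example for a hypothetical refs row (reference_no=7, author1init "J.", author1last "Smith", author2init "A.", author2last "Doe", pubyr "1999"):

// ""     + "J."    + " "   (author1init: prefix "", suffix " ")
// ""     + "Smith" + " "   (author1last)
// "and " + "A."    + " "   (author2init: prefix "and ")
// ""     + "Doe"   + " "   (author2last)
// plus the year, so refIdMap.get(7L) would yield "J. Smith and A. Doe 1999"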

    static void parseMerged(Map<Long, Long> mergedMap, InputStream resourceAsStream) throws PropertyEnricherException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(resourceAsStream));
        String line;
        try {
            while ((line = reader.readLine()) != null) {
                String[] rowValues = StringUtils.splitByWholeSeparatorPreserveAllTokens(line, "|");
                if (rowValues.length > 1) {
                    String oldTaxId = rowValues[0];
                    String newTaxId = rowValues[1];
                    if (StringUtils.isNotBlank(oldTaxId) && StringUtils.isNotBlank(newTaxId)) {
                        mergedMap.put(
                                Long.parseLong(oldTaxId),
                                Long.parseLong(newTaxId)
                        );
                    }
                }
            }
        } catch (IOException e) {
            throw new PropertyEnricherException("failed to parse ITIS taxon dump", e);
        }
    }
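parseMerged is static and package-private, so a sketch placed in the same package as WikidataTaxonService can exercise it directly; the taxon ids below are hypothetical:

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.TreeMap;

class ParseMergedSketch {
    public static void main(String[] args) throws Exception {
        // pipe-separated rows of old and new taxon ids
        Map<Long, Long> mergedMap = new TreeMap<>();
        WikidataTaxonService.parseMerged(mergedMap,
                new ByteArrayInputStream("12345|67890\n".getBytes(StandardCharsets.UTF_8)));
        // mergedMap now maps 12345 -> 67890
        System.out.println(mergedMap);
    }
}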

    @Override
    protected void lazyInit() throws PropertyEnricherException {

        File taxonomyDir = new File(getCacheDir(), StringUtils.lowerCase(getTaxonomyProvider().name()));
        DB db = DBMaker
                .newFileDB(taxonomyDir)
                .mmapFileEnableIfSupported()
                .mmapFileCleanerHackDisable()
                .compressionEnable()
                .closeOnJvmShutdown()
                .transactionDisable()
                .make();

        if (db.exists(NODES)
                && db.exists(CHILD_PARENT)
                && db.exists(MERGED_NODES)
                && db.exists(NAME_TO_NODE_IDS)
                && db.exists(AUTHORS)) {
            LOG.debug("ITIS taxonomy already indexed at [" + taxonomyDir.getAbsolutePath() + "], no need to import.");
            nodes = db.getTreeMap(NODES);
            childParent = db.getTreeMap(CHILD_PARENT);
            mergedNodes = db.getTreeMap(MERGED_NODES);
            name2nodeIds = db.getTreeMap(NAME_TO_NODE_IDS);
            refIds = db.getTreeMap(AUTHORS);
        } else {
            indexITIS(db);
        }
    }

    private void indexITIS(DB db) throws PropertyEnricherException {
        LOG.info("ITIS taxonomy importing...");
        StopWatch watch = new StopWatch();
        watch.start();

        refIds = db
                .createTreeMap(AUTHORS)
                .keySerializer(BTreeKeySerializer.ZERO_OR_POSITIVE_LONG)
                .valueSerializer(Serializer.STRING)
                .make();

        try {
            InputStream resource = getCtx().retrieve(getReferences());
            if (resource == null) {
                throw new PropertyEnricherException("init failure: failed to find [" + getReferences() + "]");
            }
            parseReferences(refIds, resource);
        } catch (IOException e) {
            throw new PropertyEnricherException("failed to parse references", e);
        }

        nodes = db
                .createTreeMap(NODES)
                .keySerializer(BTreeKeySerializer.ZERO_OR_POSITIVE_LONG)
                .valueSerializer(Serializer.JAVA)
                .make();

        childParent = db
                .createTreeMap(CHILD_PARENT)
                .keySerializer(BTreeKeySerializer.ZERO_OR_POSITIVE_LONG)
                .valueSerializer(Serializer.LONG)
                .make();

        name2nodeIds = db
                .createTreeMap(NAME_TO_NODE_IDS)
                .keySerializer(BTreeKeySerializer.STRING)
                .valueSerializer(Serializer.JAVA)
                .make();

        mergedNodes = db
                .createTreeMap(MERGED_NODES)
                .keySerializer(BTreeKeySerializer.ZERO_OR_POSITIVE_LONG)
                .valueSerializer(Serializer.LONG)
                .make();

        try {
            parseNodes(
                    nodes,
                    childParent,
                    name2nodeIds,
                    refIds,
                    mergedNodes, getCtx().retrieve(getNodesUrl())
            );
        } catch (IOException e) {
            throw new PropertyEnricherException("failed to parse ITIS nodes", e);
        }

        watch.stop();
        TaxonCacheService.logCacheLoadStats(watch.getTime(), nodes.size(), LOG);
        LOG.info("[" + getTaxonomyProvider().name() + "] taxonomy imported.");
    }

    @Override
    public void shutdown() {

    }

    private URI getNodesUrl() throws PropertyEnricherException {
        return CacheUtil.getValueURI(getCtx(), "nomer.pbdb.taxa");
    }

    private URI getReferences() throws PropertyEnricherException {
        return CacheUtil.getValueURI(getCtx(), "nomer.pbdb.refs");
    }

}