Skip to content

Commit

Permalink
towards supporting offline wikidata taxon matcher; related to #181
Browse files Browse the repository at this point in the history
  • Loading branch information
Jorrit Poelen committed Jun 24, 2024
1 parent 3f90531 commit 5a5741f
Show file tree
Hide file tree
Showing 4 changed files with 13,496 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package org.globalbioticinteractions.nomer.match;

import org.eol.globi.taxon.TermMatcher;
import org.globalbioticinteractions.nomer.util.TermMatcherContext;

/**
 * Factory registering the offline-enabled Wikidata taxon matcher under the
 * preferred name {@code "wikidata"}.
 */
public class TermMatcherWikidataTaxonFactory implements TermMatcherFactory {

    @Override
    public String getPreferredName() {
        return "wikidata";
    }

    @Override
    public String getDescription() {
        return "Lookup Wikidata taxon by name or id using offline-enabled database dump";
    }

    /**
     * Creates the Wikidata-backed term matcher.
     *
     * @param ctx context providing cache dir and resource retrieval
     * @return a matcher backed by {@link WikidataTaxonService}
     */
    @Override
    public TermMatcher createTermMatcher(TermMatcherContext ctx) {
        // Fixed copy/paste remnant: this factory previously instantiated
        // PBDBTaxonService, but it advertises itself as "wikidata" and must
        // create the Wikidata service introduced alongside it.
        return new WikidataTaxonService(ctx);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
package org.globalbioticinteractions.nomer.match;

import com.Ostermiller.util.LabeledCSVParser;
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.eol.globi.data.CharsetConstant;
import org.eol.globi.domain.NameType;
import org.eol.globi.domain.Taxon;
import org.eol.globi.domain.TaxonImpl;
import org.eol.globi.domain.TaxonomyProvider;
import org.eol.globi.service.PropertyEnricherException;
import org.eol.globi.service.TaxonUtil;
import org.eol.globi.taxon.TaxonCacheService;
import org.eol.globi.util.CSVTSVUtil;
import org.globalbioticinteractions.nomer.util.CacheUtil;
import org.globalbioticinteractions.nomer.util.TermMatcherContext;
import org.globalbioticinteractions.wikidata.WikidataUtil;
import org.mapdb.BTreeKeySerializer;
import org.mapdb.BTreeMap;
import org.mapdb.DB;
import org.mapdb.DBMaker;
import org.mapdb.Serializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import static org.eol.globi.domain.NameType.HAS_ACCEPTED_NAME;
import static org.eol.globi.domain.NameType.NONE;
import static org.eol.globi.domain.NameType.SAME_AS;
import static org.eol.globi.domain.NameType.SYNONYM_OF;

/**
 * Taxon matching service intended to back the offline Wikidata matcher.
 *
 * NOTE(review): much of this class is still derived from a PBDB/ITIS-style
 * service — the node/reference parsers read PBDB dump columns and the config
 * keys are {@code nomer.pbdb.*}. The Wikidata-specific logic so far is the
 * static {@link #parseTaxon(JsonNode)} / {@link #parseRelatedIds(JsonNode)}
 * helpers. TODO confirm intended end state against issue #181.
 */
public class WikidataTaxonService extends CommonLongTaxonService {
    private static final Logger LOG = LoggerFactory.getLogger(WikidataTaxonService.class);

    // MapDB collection name for the reference-id -> authorship map.
    private static final String AUTHORS = "author";

    // Maps the PBDB dump's "difference" column onto a name relation type.
    // Hoisted to a constant: the original rebuilt this map for every parsed row.
    private static final Map<String, NameType> DIFFERENCE_TO_NAME_TYPE = new TreeMap<String, NameType>() {{
        put("corrected to", HAS_ACCEPTED_NAME);
        put("invalid subgroup of", NONE);
        put("misspelling of", HAS_ACCEPTED_NAME);
        put("nomen dubium", SAME_AS);
        put("nomen nudum", SAME_AS);
        put("nomen oblitum", SAME_AS);
        put("nomen vanum", SAME_AS);
        put("objective synonym of", SYNONYM_OF);
        put("obsolete variant of", HAS_ACCEPTED_NAME);
        put("reassigned as", HAS_ACCEPTED_NAME);
        put("recombined as", SAME_AS);
        put("replaced by", HAS_ACCEPTED_NAME);
        put("subjective synonym of", SYNONYM_OF);
    }};

    // reference id -> abbreviated authorship string, built by parseReferences(...)
    private BTreeMap<Long, String> refIds;


    public WikidataTaxonService(TermMatcherContext ctx) {
        super(ctx);
    }

    /**
     * Collects related external identifiers from a Wikidata entity document by
     * scanning its {@code /claims} for "external-id" snaks whose property maps
     * to a known taxonomy provider.
     *
     * @param jsonNode a Wikidata entity document
     * @return prefixed external ids (e.g. provider prefix + value); empty if none
     */
    public static List<String> parseRelatedIds(JsonNode jsonNode) {
        List<String> relatedIds = new ArrayList<>();

        JsonNode externalIds = jsonNode.at("/claims");
        for (JsonNode externalId : externalIds) {
            JsonNode dataType = externalId.at("/0/mainsnak/datatype");
            if (!dataType.isMissingNode()) {
                if (StringUtils.equals("external-id", dataType.asText())) {
                    JsonNode externalIdScheme = externalId.at("/0/mainsnak/property");
                    if (!externalIdScheme.isMissingNode()) {
                        String externalIdSchemeValue = externalIdScheme.asText();
                        TaxonomyProvider taxonomyProvider
                                = WikidataUtil.WIKIDATA_TO_PROVIDER.get(externalIdSchemeValue);
                        if (taxonomyProvider != null) {
                            JsonNode identifier = externalId.at("/0/mainsnak/datavalue/value");
                            if (!identifier.isMissingNode()) {
                                relatedIds.add(taxonomyProvider.getIdPrefix() + identifier.asText());
                            }
                        }
                    }
                }
            }
        }
        return relatedIds;
    }

    /**
     * Builds a Taxon from a Wikidata entity document:
     * external id from {@code /id} (prefixed "WD:"), common names from claim
     * P1843 (with "@language" suffixes), canonical name from P225, and a
     * two-element path of parent (P171) + self.
     */
    public static Taxon parseTaxon(JsonNode jsonNode) {
        Taxon taxon = new TaxonImpl();

        JsonNode at = jsonNode.at("/id");
        if (!at.isMissingNode()) {
            taxon.setExternalId("WD:" + at.asText());
        }

        JsonNode labels = jsonNode.at("/claims/P1843");
        List<String> commonNames = new ArrayList<>();
        for (JsonNode label : labels) {
            JsonNode value = label.at("/mainsnak/datavalue/value");
            commonNames.add(value.get("text").asText() + " @" + value.get("language").asText());
        }

        taxon.setCommonNames(StringUtils.join(commonNames, CharsetConstant.SEPARATOR));

        JsonNode name = jsonNode.at("/claims/P225/0/mainsnak/datavalue/value");
        if (!name.isMissingNode()) {
            taxon.setName(name.asText());
        }

        JsonNode parentId = jsonNode.at("/claims/P171/0/mainsnak/datavalue/value/id");
        if (!parentId.isMissingNode()) {
            taxon.setPathIds("WD:" + parentId.asText() + CharsetConstant.SEPARATOR + taxon.getId());
        }
        return taxon;
    }

    @Override
    public TaxonomyProvider getTaxonomyProvider() {
        // NOTE(review): still reports PBDB — likely a copy/paste remnant of the
        // service this class was derived from. Behavior kept as-is; switch to a
        // Wikidata provider constant once available. TODO confirm.
        return TaxonomyProvider.PBDB;
    }

    /**
     * Parses a PBDB-style tab-separated taxon dump and populates the supplied
     * indexes.
     *
     * @param taxonMap    taxon id -> taxon property map
     * @param childParent taxon id -> parent taxon id
     * @param name2nodeIds name -> ids of taxa carrying that name
     * @param authorIds   reference id -> authorship string (see parseReferences)
     * @param mergedNodes provided id -> accepted id for synonyms/corrections
     * @param is          the dump stream
     * @throws PropertyEnricherException on read failure
     */
    void parseNodes(Map<Long, Map<String, String>> taxonMap,
                    Map<Long, Long> childParent,
                    Map<String, List<Long>> name2nodeIds,
                    Map<Long, String> authorIds,
                    BTreeMap<Long, Long> mergedNodes,
                    InputStream is) throws PropertyEnricherException {
        try {
            LabeledCSVParser labeledTSVParser = CSVTSVUtil.createLabeledTSVParser(is);

            while (labeledTSVParser.getLine() != null) {

                String providedId = labeledTSVParser.getValueByLabel("taxon_no");
                String providedName = labeledTSVParser.getValueByLabel("taxon_name");
                String providedRank = labeledTSVParser.getValueByLabel("taxon_rank");
                String providedParentId = labeledTSVParser.getValueByLabel("parent_no");
                Taxon providedTaxon = new TaxonImpl(
                        providedName,
                        TaxonomyProvider.PBDB.getIdPrefix() + providedId
                );
                providedTaxon.setRank(providedRank);

                String acceptedId = labeledTSVParser.getValueByLabel("accepted_no");
                String acceptedName = labeledTSVParser.getValueByLabel("accepted_name");
                String acceptedRank = labeledTSVParser.getValueByLabel("accepted_rank");
                Taxon acceptedTaxon = new TaxonImpl(
                        acceptedName,
                        TaxonomyProvider.PBDB.getIdPrefix() + acceptedId
                );
                acceptedTaxon.setRank(acceptedRank);

                String different = labeledTSVParser.getValueByLabel("difference");

                // Guard against blank/missing reference ids: the original called
                // Long.parseLong unconditionally and would throw NumberFormatException.
                String referenceId = labeledTSVParser.getValueByLabel("reference_no");
                String authorship = NumberUtils.isCreatable(referenceId)
                        ? authorIds.get(Long.parseLong(referenceId))
                        : null;

                providedTaxon.setAuthorship(authorship);

                if (NumberUtils.isCreatable(providedId)) {
                    Long taxonKey = Long.parseLong(providedId);
                    registerIdForName(taxonKey, providedTaxon, name2nodeIds);
                    taxonMap.put(taxonKey, TaxonUtil.taxonToMap(providedTaxon));
                    if (NumberUtils.isCreatable(providedParentId)) {
                        childParent.put(
                                taxonKey,
                                Long.parseLong(providedParentId)
                        );
                    }

                    // record synonym/correction relations as merged nodes
                    if (DIFFERENCE_TO_NAME_TYPE.containsKey(different) && NumberUtils.isCreatable(acceptedId)) {
                        mergedNodes.put(Long.parseLong(providedId), Long.parseLong(acceptedId));
                    }
                }


            }
        } catch (IOException e) {
            throw new PropertyEnricherException("failed to parse PBDB taxon dump", e);
        }
    }

    /**
     * Parses the PBDB references dump into a reference-id -> authorship map.
     * Authorship is "[init ][last ][and init ][last ]year" built from the first
     * two author columns.
     */
    private static void parseReferences(Map<Long, String> refIdMap, InputStream resourceAsStream) throws PropertyEnricherException {
        try {
            LabeledCSVParser parser = CSVTSVUtil.createLabeledTSVParser(resourceAsStream);
            while (parser.getLine() != null) {
                String refId = parser.getValueByLabel("reference_no");
                if (NumberUtils.isCreatable(refId)) {
                    String year = parser.getValueByLabel("pubyr");
                    StringBuilder builder = new StringBuilder();
                    appendIfNotBlank(parser, builder, "author1init", " ", "");
                    appendIfNotBlank(parser, builder, "author1last", " ", "");
                    appendIfNotBlank(parser, builder, "author2init", " ", "and ");
                    appendIfNotBlank(parser, builder, "author2last", " ", "");

                    refIdMap.put(Long.parseLong(refId), builder.toString() + year);
                }
            }
        } catch (IOException e) {
            // message corrected: this parses references, not ITIS taxon unit types
            throw new PropertyEnricherException("failed to parse references", e);
        }
    }

    /**
     * Appends {@code prefix + value + suffix} to the builder when the labeled
     * column is non-blank; no-op otherwise.
     *
     * @return the raw column value (possibly blank), unused by current callers
     */
    private static String appendIfNotBlank(LabeledCSVParser parser, StringBuilder builder, String label, String suffix, String prefix) {
        String value = parser.getValueByLabel(label);
        if (StringUtils.isNotBlank(value)) {
            builder.append(prefix);
            builder.append(value);
            builder.append(suffix);
        }
        return value;
    }


    /**
     * Parses a pipe-separated "old id | new id" merged-taxa listing into the
     * supplied map. Rows with fewer than two columns or blank ids are skipped.
     */
    static void parseMerged(Map<Long, Long> mergedMap, InputStream resourceAsStream) throws PropertyEnricherException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(resourceAsStream));
        String line;
        try {
            while ((line = reader.readLine()) != null) {
                String[] rowValues = StringUtils.splitByWholeSeparatorPreserveAllTokens(line, "|");
                if (rowValues.length > 1) {
                    String oldTaxId = rowValues[0];
                    String newTaxId = rowValues[1];
                    if (StringUtils.isNotBlank(oldTaxId) && StringUtils.isNotBlank(newTaxId)) {
                        mergedMap.put(
                                Long.parseLong(oldTaxId),
                                Long.parseLong(newTaxId)
                        );
                    }
                }
            }
        } catch (IOException e) {
            // message corrected: previously claimed an ITIS taxon dump
            throw new PropertyEnricherException("failed to parse merged taxa", e);
        }
    }

    /**
     * Opens (or builds) the on-disk MapDB index. If all expected collections
     * exist, they are attached; otherwise the dumps are imported from scratch.
     */
    @Override
    protected void lazyInit() throws PropertyEnricherException {

        File taxonomyDir = new File(getCacheDir(), StringUtils.lowerCase(getTaxonomyProvider().name()));
        DB db = DBMaker
                .newFileDB(taxonomyDir)
                .mmapFileEnableIfSupported()
                .mmapFileCleanerHackDisable()
                .compressionEnable()
                .closeOnJvmShutdown()
                .transactionDisable()
                .make();

        if (db.exists(NODES)
                && db.exists(CHILD_PARENT)
                && db.exists(MERGED_NODES)
                && db.exists(NAME_TO_NODE_IDS)
                && db.exists(AUTHORS)) {
            // log message corrected: previously hard-coded "ITIS"
            LOG.debug("[" + getTaxonomyProvider().name() + "] taxonomy already indexed at [" + taxonomyDir.getAbsolutePath() + "], no need to import.");
            nodes = db.getTreeMap(NODES);
            childParent = db.getTreeMap(CHILD_PARENT);
            mergedNodes = db.getTreeMap(MERGED_NODES);
            name2nodeIds = db.getTreeMap(NAME_TO_NODE_IDS);
            refIds = db.getTreeMap(AUTHORS);
        } else {
            indexTaxonomy(db);
        }
    }

    // Imports references first (so authorships can be resolved while parsing
    // nodes), then the taxon nodes. Renamed from the copy/paste "indexITIS".
    private void indexTaxonomy(DB db) throws PropertyEnricherException {
        LOG.info("[" + getTaxonomyProvider().name() + "] taxonomy importing...");
        StopWatch watch = new StopWatch();
        watch.start();

        refIds = db
                .createTreeMap(AUTHORS)
                .keySerializer(BTreeKeySerializer.ZERO_OR_POSITIVE_LONG)
                .valueSerializer(Serializer.STRING)
                .make();

        try {
            InputStream resource = getCtx().retrieve(getReferences());
            if (resource == null) {
                throw new PropertyEnricherException("init failure: failed to find [" + getReferences() + "]");
            }
            parseReferences(refIds, resource);
        } catch (IOException e) {
            throw new PropertyEnricherException("failed to parse references", e);
        }

        nodes = db
                .createTreeMap(NODES)
                .keySerializer(BTreeKeySerializer.ZERO_OR_POSITIVE_LONG)
                .valueSerializer(Serializer.JAVA)
                .make();

        childParent = db
                .createTreeMap(CHILD_PARENT)
                .keySerializer(BTreeKeySerializer.ZERO_OR_POSITIVE_LONG)
                .valueSerializer(Serializer.LONG)
                .make();

        name2nodeIds = db
                .createTreeMap(NAME_TO_NODE_IDS)
                .keySerializer(BTreeKeySerializer.STRING)
                .valueSerializer(Serializer.JAVA)
                .make();

        mergedNodes = db
                .createTreeMap(MERGED_NODES)
                .keySerializer(BTreeKeySerializer.ZERO_OR_POSITIVE_LONG)
                .valueSerializer(Serializer.LONG)
                .make();

        try {
            parseNodes(
                    nodes,
                    childParent,
                    name2nodeIds,
                    refIds,
                    mergedNodes, getCtx().retrieve(getNodesUrl())
            );
        } catch (IOException e) {
            // message corrected: previously claimed ITIS nodes
            throw new PropertyEnricherException("failed to parse nodes at [" + getNodesUrl() + "]", e);
        }

        watch.stop();
        TaxonCacheService.logCacheLoadStats(watch.getTime(), nodes.size(), LOG);
        LOG.info("[" + getTaxonomyProvider().name() + "] taxonomy imported.");
    }

    @Override
    public void shutdown() {
        // intentionally empty: the MapDB instance is configured with
        // closeOnJvmShutdown() in lazyInit()
    }

    // NOTE(review): config keys still point at PBDB dumps — copy/paste remnant
    // pending the actual Wikidata dump wiring (#181). TODO confirm.
    private URI getNodesUrl() throws PropertyEnricherException {
        return CacheUtil.getValueURI(getCtx(), "nomer.pbdb.taxa");
    }

    private URI getReferences() throws PropertyEnricherException {
        return CacheUtil.getValueURI(getCtx(), "nomer.pbdb.refs");
    }

}
Loading

0 comments on commit 5a5741f

Please sign in to comment.