Skip to content

Commit

Permalink
add xml support to discoverlife taxonomic name indexing; related to #161
Browse files Browse the repository at this point in the history
  • Loading branch information
Jorrit Poelen committed Apr 5, 2024
1 parent 41c5a7f commit 2433824
Show file tree
Hide file tree
Showing 10 changed files with 93 additions and 98 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class DiscoverLifeUtil {
public class DiscoverLifeUtilXHTML {

private static final List<String> PATH_STATIC = Arrays.asList("Animalia", "Arthropoda", "Insecta", "Hymenoptera");
public static final String URL_ENDPOINT_DISCOVER_LIFE = "https://www.discoverlife.org";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@
import java.util.stream.Stream;

import static org.eol.globi.service.TaxonUtil.generateTaxonPathNames;
import static org.eol.globi.taxon.DiscoverLifeUtil.emitNameRelatedToFocalTaxon;
import static org.eol.globi.taxon.DiscoverLifeUtilXHTML.emitNameRelatedToFocalTaxon;

public class DiscoverLifeUtil2 {
public class DiscoverLifeUtilXML {

public static final List<String> RANKS = Arrays.asList("Family", "Subfamily", "Tribe", "Subtribe", "Genus", "Subgenus");

Expand Down Expand Up @@ -112,7 +112,7 @@ public void accept(String recordXml) {

Document doc = builder.parse(recordInputStream);
Map<String, String> nameMap = parseFocalTaxon(doc);
DiscoverLifeUtil.emitNameRelation(listener, nameMap, TaxonUtil.mapToTaxon(nameMap));
DiscoverLifeUtilXHTML.emitNameRelation(listener, nameMap, TaxonUtil.mapToTaxon(nameMap));

List<Taxon> relatedTaxa = parseRelatedNames(doc);

Expand Down Expand Up @@ -162,7 +162,7 @@ static Map<String, String> parseFocalTaxon(Document doc) throws XPathExpressionE
nameMap.put(PropertyAndValueDictionary.RANK, taxonomicRank);
String name = nameMap.get(PropertyAndValueDictionary.NAME);
nameMap.put(taxonomicRank, name);
nameMap.put(PropertyAndValueDictionary.EXTERNAL_ID, DiscoverLifeUtil.URL_ENDPOINT_DISCOVER_LIFE_SEARCH + StringUtils.replace(name, " ", "+"));
nameMap.put(PropertyAndValueDictionary.EXTERNAL_ID, DiscoverLifeUtilXHTML.URL_ENDPOINT_DISCOVER_LIFE_SEARCH + StringUtils.replace(name, " ", "+"));


NodeList attr = (NodeList) XmlUtil.applyXPath(setNode, "//attributes", XPathConstants.NODESET);
Expand Down Expand Up @@ -231,7 +231,7 @@ public static List<Taxon> parseRelatedNames(Document doc) throws XPathExpression
List<String> namesWithoutRemarks = Arrays.asList(nameParts).subList(0, nameParts.length > 2 ? nameParts.length - 1 : nameParts.length);
return StringUtils.join(namesWithoutRemarks, ",");
})
.map(DiscoverLifeUtil2::parse)
.map(DiscoverLifeUtilXML::parse)
.filter(Objects::nonNull);
}
return relatedNames.collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
@@ -1,34 +1,23 @@
package org.eol.globi.taxon;

import org.apache.commons.io.IOUtils;
import org.eol.globi.domain.NameType;
import org.eol.globi.domain.Taxon;
import org.eol.globi.domain.Term;
import org.eol.globi.service.ResourceService;
import org.hamcrest.core.Is;
import org.junit.Test;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertNull;

public class DiscoverLifeUtilIntegrationTest {
public class DiscoverLifeUtilXHTMLIntegrationTest {

private static final String DISCOVER_LIFE_URL
= DiscoverLifeUtil.URL_ENDPOINT_DISCOVER_LIFE +
= DiscoverLifeUtilXHTML.URL_ENDPOINT_DISCOVER_LIFE +
"/mp/20q" +
"?act=x_checklist" +
"&guide=Apoidea_species" +
Expand All @@ -50,7 +39,7 @@ public void parseBees() throws IOException {
}
};

DiscoverLifeUtil.parse(DiscoverLifeUtil.getBeeNameTable(new ResourceService() {
DiscoverLifeUtilXHTML.parse(DiscoverLifeUtilXHTML.getBeeNameTable(new ResourceService() {
@Override
public InputStream retrieve(URI uri) throws IOException {
return DiscoverLifeTestUtil.getStreamOfBees(BEE_NAMES_CACHED);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import org.eol.globi.domain.NameType;
import org.eol.globi.domain.Taxon;
import org.eol.globi.domain.Term;
import org.eol.globi.service.ResourceService;
import org.hamcrest.core.Is;
import org.junit.Test;
import org.w3c.dom.NodeList;
Expand All @@ -17,17 +16,15 @@
import javax.xml.xpath.XPathExpressionException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertNull;

public class DiscoverLifeUtilTest {
public class DiscoverLifeUtilXHTMLTest {

@Test
public void parseNameAncylandrenaAtoposoma() throws SAXException, ParserConfigurationException, XPathExpressionException, IOException {
Expand Down Expand Up @@ -61,7 +58,7 @@ public void parseNameAncylandrenaAtoposoma() throws SAXException, ParserConfigur

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -208,7 +205,7 @@ public void parseNameRelationsAndrenaAccepta() throws SAXException, ParserConfig

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -272,7 +269,7 @@ public void parseHomonymAllodapeClypeata() throws SAXException, ParserConfigurat

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -339,7 +336,7 @@ public void parseNameRelationsDanglingVar() throws SAXException, ParserConfigura

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -393,7 +390,7 @@ public void parseNameRelationsDanglingVarWithLeadingComma() throws SAXException,

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -445,7 +442,7 @@ public void parseNameAcceptedHomonym() throws SAXException, ParserConfigurationE

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -557,7 +554,7 @@ public void parseNameRelationsWithHomonym() throws SAXException, ParserConfigura

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -614,7 +611,7 @@ public void parseNameRelationsWithSicSuffix() throws SAXException, ParserConfigu

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -665,7 +662,7 @@ public void parseNameRelationsWithSicSuffix2() throws SAXException, ParserConfig

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand Down Expand Up @@ -702,7 +699,7 @@ public void parseNamePseudopanurgus_parvus() throws SAXException, ParserConfigur

List<Triple<Term, NameType, Taxon>> relatedTaxa = new ArrayList<>();

DiscoverLifeUtil.parseNames(null, nodes.item(0), new TermMatchListener() {
DiscoverLifeUtilXHTML.parseNames(null, nodes.item(0), new TermMatchListener() {

@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
Expand All @@ -729,27 +726,27 @@ public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameTy

@Test
public void guessRank() throws IOException {
assertThat(DiscoverLifeUtil.guessRankFromName("Bla bla"), Is.is("species"));
assertThat(DiscoverLifeUtilXHTML.guessRankFromName("Bla bla"), Is.is("species"));
}

@Test
public void guessRankFamily() throws IOException {
assertThat(DiscoverLifeUtil.guessRankFromName("Bla (Bla) bla"), Is.is("species"));
assertThat(DiscoverLifeUtilXHTML.guessRankFromName("Bla (Bla) bla"), Is.is("species"));
}

@Test
public void guessRankSubspecies() throws IOException {
assertThat(DiscoverLifeUtil.guessRankFromName("Bla bla bla"), Is.is("subspecies"));
assertThat(DiscoverLifeUtilXHTML.guessRankFromName("Bla bla bla"), Is.is("subspecies"));
}

@Test
public void guessRankVariant() throws IOException {
assertThat(DiscoverLifeUtil.guessRankFromName("Bla bla var bla"), Is.is("variety"));
assertThat(DiscoverLifeUtilXHTML.guessRankFromName("Bla bla var bla"), Is.is("variety"));
}

@Test
public void guessRankSubvariant() throws IOException {
assertThat(DiscoverLifeUtil.guessRankFromName("Bla bla bla var bla"), Is.is("subvariety"));
assertThat(DiscoverLifeUtilXHTML.guessRankFromName("Bla bla bla var bla"), Is.is("subvariety"));
}

@Test
Expand All @@ -770,7 +767,7 @@ public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameTy
}
};

DiscoverLifeUtil.parseTaxonPage(is, termMatchListener);
DiscoverLifeUtilXHTML.parseTaxonPage(is, termMatchListener);


assertThat(acceptedFound.get().getName(), Is.is("Agapostemon texanus"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import org.eol.globi.domain.NameType;
import org.eol.globi.domain.Taxon;
import org.eol.globi.domain.Term;
import org.eol.globi.service.TaxonUtil;
import org.hamcrest.core.Is;
import org.junit.Test;

Expand All @@ -14,22 +13,22 @@

import static org.hamcrest.MatcherAssert.assertThat;

public class DiscoverLifeUtil2IntegrationTest {
public class DiscoverLifeUtilXMLIntegrationTest {

public static final String BEES_XML_GZIP = "/org/globalbioticinteractions/nomer/match/discoverlife/bees.xml.gz";

@Test
public void compareLocalVersionToRemoteVersion() throws IOException {
DiscoverLifeTestUtil.compareLocalVersionToRemoteVersion(
BEES_XML_GZIP,
DiscoverLifeUtil.URL_ENDPOINT_DISCOVER_LIFE + "/nh/id/20q/Apoidea_species.xml"
DiscoverLifeUtilXHTML.URL_ENDPOINT_DISCOVER_LIFE + "/nh/id/20q/Apoidea_species.xml"
);
}

@Test
public void parseNames() throws ParserConfigurationException, IOException {
AtomicInteger counter = new AtomicInteger(0);
DiscoverLifeUtil2.parse(new GZIPInputStream(getClass().getResourceAsStream(BEES_XML_GZIP)), new TermMatchListener() {
DiscoverLifeUtilXML.parse(new GZIPInputStream(getClass().getResourceAsStream(BEES_XML_GZIP)), new TermMatchListener() {
@Override
public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
counter.incrementAndGet();
Expand Down
Loading

0 comments on commit 2433824

Please sign in to comment.