Skip to content

Commit

Permalink
refactor/add integration tests; related to #161
Browse files Browse the repository at this point in the history
  • Loading branch information
Jorrit Poelen committed Apr 4, 2024
1 parent 770f5cd commit d61a12a
Show file tree
Hide file tree
Showing 6 changed files with 300 additions and 216 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ static void parseNames(Node familyNameNode, Node nameNodeCandidate, TermMatchLis

if (StringUtils.equals("i", currentNode.getNodeName())) {
Map<String, String> taxonMap = new TreeMap<>();
String taxonName = trimNameNodeTextContent(nameNodeCandidate);
String taxonName = trimNameNodeTextContent(nameNodeCandidate.getTextContent());
taxonMap.put(PropertyAndValueDictionary.NAME, taxonName);

Node expectedTextNode = currentNode.getNextSibling();
Expand Down Expand Up @@ -135,7 +135,8 @@ private static void handleRelatedNames(

Map<String, String> relatedName = new TreeMap<>();

String authorshipString = enrichFromNameString(currentNode, relatedName);
Node authorshipNodeCandidate = currentNode.getNextSibling();
String authorshipString = enrichFromNameString(relatedName, currentNode.getTextContent(), authorshipNodeCandidate == null ? null : authorshipNodeCandidate.getTextContent());

currentNode = currentNode.getNextSibling();

Expand Down Expand Up @@ -251,13 +252,12 @@ private static void enrichFromAuthorString(String authorshipString, Map<String,
}
}

private static String enrichFromNameString(Node currentNode, Map<String, String> relatedName) {
String altName = trimNameNodeTextContent(currentNode);
private static String enrichFromNameString(Map<String, String> relatedName, String altNameText, String authorshipText) {
String altName = trimNameNodeTextContent(altNameText);

String authorship = null;
Node authorshipNode = currentNode.getNextSibling();
if (authorshipNode != null) {
authorship = StringUtils.trim(authorshipNode.getTextContent());
if (StringUtils.isNotBlank(authorshipText)) {
authorship = StringUtils.trim(authorshipText);
if (StringUtils.startsWith(authorship, ",")) {
authorship = StringUtils.trim(authorship.substring(1));
}
Expand All @@ -275,9 +275,9 @@ private static String enrichFromNameString(Node currentNode, Map<String, String>
return authorship;
}

private static String trimNameNodeTextContent(Node currentNode) {
private static String trimNameNodeTextContent(String textContent) {
return StringUtils.replace(
StringUtils.trim(currentNode.getTextContent()),
StringUtils.trim(textContent),
"_sic",
"");
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
package org.eol.globi.taxon;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.eol.globi.data.CharsetConstant;
import org.eol.globi.domain.NameType;
import org.eol.globi.domain.PropertyAndValueDictionary;
import org.eol.globi.service.TaxonUtil;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.function.Consumer;

import static org.eol.globi.service.TaxonUtil.generateTaxonPathNames;

/**
 * Splits a DiscoverLife XML export into individual records and turns each record
 * into a flat map of taxon properties.
 *
 * <p>Records are located between {@code __START__} and {@code __STOP__} markers;
 * within such a section, everything before the first line starting with
 * {@code <set level=} is skipped.
 */
public class DiscoverLifeUtil2 {

    /** Attribute "character" names whose first "state" value is copied into the name map. */
    public static final List<String> RANKS = Arrays.asList("Family", "Subfamily", "Tribe", "Subtribe", "Genus", "Subgenus");

    /**
     * Reads records from the stream and hands every non-blank record to the consumer.
     *
     * @param is           UTF-8 encoded input; it is NOT closed by this method
     * @param lineConsumer receives the trimmed text of each record
     * @throws UncheckedIOException if reading the stream failed; without this check a
     *                              read error would look like a normal end of input
     */
    public static void splitRecords(InputStream is, Consumer<String> lineConsumer) {
        // Deliberately not closed: closing the Scanner would also close the caller-owned stream.
        Scanner scanner = new Scanner(is, StandardCharsets.UTF_8.name());
        while (scanner.hasNext()) {
            String record = nextRecord(scanner);
            if (StringUtils.isNotBlank(record)) {
                lineConsumer.accept(record);
            }
        }
        // Scanner swallows IOExceptions and simply reports "no more input"; surface them.
        IOException readFailure = scanner.ioException();
        if (readFailure != null) {
            throw new UncheckedIOException("failed to read DiscoverLife records", readFailure);
        }
    }

    /**
     * Advances the scanner to the next record and returns it.
     *
     * <p>Side effect: the scanner's delimiter is changed by this call.
     *
     * @param scanner scanner positioned anywhere before a record
     * @return the trimmed record text starting at {@code <set level=} and ending before
     * {@code __STOP__}, or {@code null} if no further record could be found
     */
    public static String nextRecord(Scanner scanner) {
        String record = null;
        scanner.useDelimiter("__START__");
        // Guard so that calling this on an exhausted scanner returns null instead of throwing.
        if (scanner.hasNext()) {
            // discard whatever precedes the start marker
            scanner.next();
            if (scanner.hasNext()) {
                // discard the text between the start marker and the <set ...> element
                scanner.useDelimiter("\n<set level=");
                scanner.next();
                if (scanner.hasNext()) {
                    // the "\n<set level=" text is no longer a delimiter here,
                    // so it stays part of the returned token
                    scanner.useDelimiter("__STOP__");
                    record = StringUtils.trim(scanner.next());
                }
            }
        }
        return record;
    }

    /**
     * Parses every record in the stream and reports it to the listener as an accepted name.
     *
     * @param is       UTF-8 encoded DiscoverLife export; not closed by this method
     * @param listener called once per record with the same name map converted to a taxon
     *                 for both the provided and the resolved side
     * @throws ParserConfigurationException if the XML parser cannot be configured or created
     * @throws RuntimeException             if a record cannot be parsed; the message
     *                                      contains the offending record
     */
    public static void parse(InputStream is, final TermMatchListener listener) throws ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // Records are stand-alone fragments; explicitly request the parser's secure-processing mode.
        factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        final DocumentBuilder builder = factory.newDocumentBuilder();

        splitRecords(is, recordXml -> parseRecord(builder, recordXml, listener));
    }

    /** Parses a single record and notifies the listener; wraps parse failures in a RuntimeException. */
    private static void parseRecord(DocumentBuilder builder, String recordXml, TermMatchListener listener) {
        try {
            // escape unescaped ampersands
            String scrubbedXml = StringUtils.replace(recordXml, " & ", " &amp; ");
            Document doc = builder.parse(IOUtils.toInputStream(scrubbedXml, StandardCharsets.UTF_8));
            Map<String, String> nameMap = parseFocalTaxon(doc);
            listener.foundTaxonForTerm(
                    null,
                    TaxonUtil.mapToTaxon(nameMap),
                    NameType.HAS_ACCEPTED_NAME,
                    TaxonUtil.mapToTaxon(nameMap)
            );
        } catch (SAXException | IOException | XPathExpressionException e) {
            try {
                IOUtils.copy(IOUtils.toInputStream(recordXml, StandardCharsets.UTF_8), System.err);
            } catch (IOException ignored) {
                // best-effort diagnostic dump only; the record is also part of the exception below
            }
            throw new RuntimeException("failed to parse DiscoverLife record [" + recordXml + "]", e);
        }
    }

    /**
     * Builds the property map for the taxon described by a record document.
     *
     * <p>The map is seeded with a fixed higher classification (Animalia ... Apoidea), then
     * filled with the {@code name} and {@code authority} elements and the {@code level}
     * attribute of the {@code set} node. If an {@code attributes} element exists, rank
     * values listed in {@link #RANKS} are added and the path / path names are derived.
     *
     * @param doc parsed record
     * @return sorted map of taxon properties
     * @throws XPathExpressionException if an XPath expression cannot be evaluated
     */
    static Map<String, String> parseFocalTaxon(Document doc) throws XPathExpressionException {
        Map<String, String> nameMap = new TreeMap<>();
        nameMap.put("kingdom", "Animalia");
        nameMap.put("phylum", "Arthropoda");
        nameMap.put("class", "Insecta");
        nameMap.put("order", "Hymenoptera");
        nameMap.put("superfamily", "Apoidea");

        Node setNode = (Node) XmlUtil.applyXPath(doc, "set", XPathConstants.NODE);

        putTextValueForElement(nameMap, setNode, "name", PropertyAndValueDictionary.NAME);
        putTextValueForElement(nameMap, setNode, "authority", PropertyAndValueDictionary.AUTHORSHIP);

        Node level = setNode.getAttributes().getNamedItem("level");
        String taxonomicRank = level == null ? null : level.getTextContent();
        nameMap.put(PropertyAndValueDictionary.RANK, taxonomicRank);
        // A TreeMap with natural ordering rejects null keys, so only index the name
        // by its rank when the rank is actually known.
        if (taxonomicRank != null) {
            nameMap.put(taxonomicRank, nameMap.get(PropertyAndValueDictionary.NAME));
        }

        // NOTE(review): "//attributes" is an absolute XPath expression, so it is presumably
        // resolved from the document root rather than from setNode - confirm against XmlUtil.
        NodeList attr = (NodeList) XmlUtil.applyXPath(setNode, "//attributes", XPathConstants.NODESET);

        if (attr != null && attr.getLength() > 0) {
            putRanksFromAttributes(nameMap, attr.item(0));
            putPath(nameMap);
        }
        return nameMap;
    }

    /**
     * Walks the children of an attributes node: each "character" child names a property,
     * the "state" children that follow it carry its values. For characters listed in
     * {@link #RANKS}, the first state value is stored under the lower-cased character name.
     */
    private static void putRanksFromAttributes(Map<String, String> nameMap, Node attributesNode) {
        NodeList childNodes = attributesNode.getChildNodes();

        String currentCharacter = "";
        String firstState = null;

        for (int i = 0; i < childNodes.getLength(); i++) {
            Node childNode = childNodes.item(i);
            String nodeName = childNode.getNodeName();
            if (StringUtils.equals("character", nodeName)) {
                currentCharacter = childNode.getTextContent();
                firstState = null;
            } else if (StringUtils.equals("state", nodeName) && StringUtils.isNotBlank(currentCharacter)) {
                if (firstState == null) {
                    // only the first state following a character is used
                    firstState = childNode.getTextContent();
                }
                if (RANKS.contains(currentCharacter)) {
                    nameMap.put(StringUtils.lowerCase(currentCharacter), firstState);
                }
            }
        }
    }

    /**
     * Stores the path names produced by {@code generateTaxonPathNames} and the matching
     * path, i.e. the map values looked up for each of those path names.
     */
    private static void putPath(Map<String, String> nameMap) {
        String pathNames = generateTaxonPathNames(nameMap, Arrays.asList("kingdom", "phylum", "class", "order", "family", "subfamily", "tribe", "subtribe", "genus", "subgenus", "subspecies"), "", "genus", "specificEpithet", "subspecificEpithet", "species");

        nameMap.put(PropertyAndValueDictionary.PATH_NAMES, pathNames);

        String[] rankNames = StringUtils.splitByWholeSeparator(pathNames, CharsetConstant.SEPARATOR);
        List<String> path = new ArrayList<>();

        for (String rankName : rankNames) {
            path.add(nameMap.get(rankName));
        }

        String pathString = StringUtils.join(path, CharsetConstant.SEPARATOR);
        nameMap.put(PropertyAndValueDictionary.PATH, pathString);
    }

    /**
     * Copies the text content of the element selected by {@code sourceElementName}
     * (relative to {@code setNode}) into the map under {@code targetName}; does nothing
     * if no such element is found.
     */
    private static void putTextValueForElement(Map<String, String> nameMap, Node setNode, String sourceElementName, String targetName) throws XPathExpressionException {
        Node nameNode = (Node) XmlUtil.applyXPath(setNode, sourceElementName, XPathConstants.NODE);
        if (nameNode != null) {
            nameMap.put(targetName, nameNode.getTextContent());
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package org.eol.globi.taxon;

import org.eol.globi.domain.NameType;
import org.eol.globi.domain.Taxon;
import org.eol.globi.domain.Term;
import org.hamcrest.core.Is;
import org.junit.Test;

import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;

import static org.hamcrest.MatcherAssert.assertThat;

/**
 * Integration tests for {@link DiscoverLifeUtil2} based on the bundled gzipped
 * bee checklist resource.
 */
public class DiscoverLifeUtil2IntegrationTest {

    /** Classpath location of the gzipped DiscoverLife bee checklist used by these tests. */
    public static final String BEES_XML_GZIP = "/org/globalbioticinteractions/nomer/match/discoverlife/bees.xml.gz";

    /**
     * Delegates to {@code DiscoverLifeTestUtil} to compare the bundled resource with the
     * checklist published at the DiscoverLife endpoint.
     */
    @Test
    public void compareLocalVersionToRemoteVersion() throws IOException {
        DiscoverLifeTestUtil.compareLocalVersionToRemoteVersion(
                BEES_XML_GZIP,
                DiscoverLifeUtil.URL_ENDPOINT_DISCOVER_LIFE + "/nh/id/20q/Apoidea_species.xml"
        );
    }

    /** Parses the bundled checklist and verifies the number of records reported to the listener. */
    @Test
    public void parseNames() throws ParserConfigurationException, IOException {
        AtomicInteger counter = new AtomicInteger(0);
        // try-with-resources: closing the gzip stream also releases the underlying resource stream
        try (GZIPInputStream bees = new GZIPInputStream(getClass().getResourceAsStream(BEES_XML_GZIP))) {
            DiscoverLifeUtil2.parse(bees, new TermMatchListener() {
                @Override
                public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
                    counter.incrementAndGet();
                }
            });
        }
        assertThat(counter.get(), Is.is(20932));
    }

}
Loading

0 comments on commit d61a12a

Please sign in to comment.