refactor/add integration tests; related to #161

globalbioticinteractions · Apr 4, 2024 · d61a12a · d61a12a
1 parent 770f5cd
commit d61a12a
Show file tree

Hide file tree

Showing 6 changed files with 300 additions and 216 deletions.
diff --git a/nomer-taxon-resolver/src/main/java/org/eol/globi/taxon/DiscoverLifeUtil.java b/nomer-taxon-resolver/src/main/java/org/eol/globi/taxon/DiscoverLifeUtil.java
@@ -65,7 +65,7 @@ static void parseNames(Node familyNameNode, Node nameNodeCandidate, TermMatchLis
 
         if (StringUtils.equals("i", currentNode.getNodeName())) {
             Map<String, String> taxonMap = new TreeMap<>();
-            String taxonName = trimNameNodeTextContent(nameNodeCandidate);
+            String taxonName = trimNameNodeTextContent(nameNodeCandidate.getTextContent());
             taxonMap.put(PropertyAndValueDictionary.NAME, taxonName);
 
             Node expectedTextNode = currentNode.getNextSibling();
@@ -135,7 +135,8 @@ private static void handleRelatedNames(
 
                 Map<String, String> relatedName = new TreeMap<>();
 
-                String authorshipString = enrichFromNameString(currentNode, relatedName);
+                Node authorshipNodeCandidate = currentNode.getNextSibling();
+                String authorshipString = enrichFromNameString(relatedName, currentNode.getTextContent(), authorshipNodeCandidate == null ? null : authorshipNodeCandidate.getTextContent());
 
                 currentNode = currentNode.getNextSibling();
 
@@ -251,13 +252,12 @@ private static void enrichFromAuthorString(String authorshipString, Map<String,
         }
     }
 
-    private static String enrichFromNameString(Node currentNode, Map<String, String> relatedName) {
-        String altName = trimNameNodeTextContent(currentNode);
+    private static String enrichFromNameString(Map<String, String> relatedName, String altNameText, String authorshipText) {
+        String altName = trimNameNodeTextContent(altNameText);
 
         String authorship = null;
-        Node authorshipNode = currentNode.getNextSibling();
-        if (authorshipNode != null) {
-            authorship = StringUtils.trim(authorshipNode.getTextContent());
+        if (StringUtils.isNotBlank(authorshipText)) {
+            authorship = StringUtils.trim(authorshipText);
             if (StringUtils.startsWith(authorship, ",")) {
                 authorship = StringUtils.trim(authorship.substring(1));
             }
@@ -275,9 +275,9 @@ private static String enrichFromNameString(Node currentNode, Map<String, String>
         return authorship;
     }
 
-    private static String trimNameNodeTextContent(Node currentNode) {
+    private static String trimNameNodeTextContent(String textContent) {
         return StringUtils.replace(
-                StringUtils.trim(currentNode.getTextContent()),
+                StringUtils.trim(textContent),
                 "_sic",
                 "");
     }

diff --git a/nomer-taxon-resolver/src/main/java/org/eol/globi/taxon/DiscoverLifeUtil2.java b/nomer-taxon-resolver/src/main/java/org/eol/globi/taxon/DiscoverLifeUtil2.java
@@ -0,0 +1,210 @@
+package org.eol.globi.taxon;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.eol.globi.data.CharsetConstant;
+import org.eol.globi.domain.NameType;
+import org.eol.globi.domain.PropertyAndValueDictionary;
+import org.eol.globi.service.TaxonUtil;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpressionException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.TreeMap;
+import java.util.function.Consumer;
+
+import static org.eol.globi.service.TaxonUtil.generateTaxonPathNames;
+
+/**
+ * Reads DiscoverLife records: splits the raw stream into per-taxon XML
+ * records and maps each record onto taxon properties.
+ */
+public class DiscoverLifeUtil2 {
+
+    public static final List<String> RANKS = Arrays.asList("Family", "Subfamily", "Tribe", "Subtribe", "Genus", "Subgenus");
+
+    /**
+     * Hands every non-blank record found in the stream to the consumer.
+     * The stream is decoded as UTF-8 and is not closed by this method.
+     *
+     * @param is           stream holding __START__ ... __STOP__ delimited sections
+     * @param lineConsumer receives the trimmed text of each record
+     * @throws java.io.UncheckedIOException if reading from the stream failed
+     */
+    public static void splitRecords(InputStream is, Consumer<String> lineConsumer) {
+        Scanner scanner = new Scanner(is, StandardCharsets.UTF_8.name());
+        while (scanner.hasNext()) {
+            String record = nextRecord(scanner);
+            if (StringUtils.isNotBlank(record)) {
+                lineConsumer.accept(record);
+            }
+        }
+        // Scanner treats a read failure as end of input; surface it so that a
+        // broken stream is not mistaken for a complete, shorter input.
+        IOException readFailure = scanner.ioException();
+        if (readFailure != null) {
+            throw new java.io.UncheckedIOException("failed to read DiscoverLife records", readFailure);
+        }
+    }
+
+    /**
+     * Consumes input up to a __START__ marker, then up to a line starting with
+     * {@code <set level=}, and returns the text from there up to the following
+     * __STOP__ marker. Changes the delimiter of the given scanner.
+     *
+     * @param scanner source of the records
+     * @return the trimmed record, or null if no further record is available
+     */
+    public static String nextRecord(Scanner scanner) {
+        scanner.useDelimiter("__START__");
+        // an exhausted scanner would make next() throw NoSuchElementException
+        if (!scanner.hasNext()) {
+            return null;
+        }
+        scanner.next();
+        if (!scanner.hasNext()) {
+            return null;
+        }
+        scanner.useDelimiter("\n<set level=");
+        scanner.next();
+        if (!scanner.hasNext()) {
+            return null;
+        }
+        scanner.useDelimiter("__STOP__");
+        return StringUtils.trim(scanner.next());
+    }
+
+    /**
+     * Parses each record in the stream as XML and reports the taxon found in
+     * it to the listener, using the same taxon properties for the provided
+     * and the resolved side and {@link NameType#HAS_ACCEPTED_NAME} as relation.
+     *
+     * @param is       stream of records as understood by {@link #splitRecords}
+     * @param listener notified once per record
+     * @throws ParserConfigurationException if no XML document builder can be created
+     * @throws RuntimeException             if a record cannot be parsed; the message contains the record
+     */
+    public static void parse(InputStream is, final TermMatchListener listener) throws ParserConfigurationException {
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        final DocumentBuilder builder = factory.newDocumentBuilder();
+
+        splitRecords(is, recordXml -> {
+            try {
+                // escape unescaped ampersands
+                String scrubbedXml = StringUtils.replace(recordXml, " & ", " &amp; ");
+                Document doc = builder.parse(IOUtils.toInputStream(scrubbedXml, StandardCharsets.UTF_8));
+                Map<String, String> nameMap = parseFocalTaxon(doc);
+                listener.foundTaxonForTerm(
+                        null,
+                        TaxonUtil.mapToTaxon(nameMap),
+                        NameType.HAS_ACCEPTED_NAME,
+                        TaxonUtil.mapToTaxon(nameMap)
+                );
+            } catch (SAXException | IOException | XPathExpressionException e) {
+                // the record is part of the message, so it is not dumped to stderr separately
+                throw new RuntimeException("failed to parse DiscoverLife record [" + recordXml + "]", e);
+            }
+        });
+    }
+
+    /**
+     * Collects the properties of the taxon described by the {@code set} element
+     * of a record. Starts from a fixed classification (Animalia down to Apoidea),
+     * adds name, authorship and rank, and, if an {@code attributes} element is
+     * present, the ranks listed in {@link #RANKS} plus the path and path names.
+     *
+     * @param doc a single parsed record
+     * @return taxon properties keyed by property name
+     * @throws XPathExpressionException if an XPath lookup fails
+     * @throws IllegalArgumentException if the document has no {@code set} element
+     */
+    static Map<String, String> parseFocalTaxon(Document doc) throws XPathExpressionException {
+        Map<String, String> nameMap = new TreeMap<>();
+        nameMap.put("kingdom", "Animalia");
+        nameMap.put("phylum", "Arthropoda");
+        nameMap.put("class", "Insecta");
+        nameMap.put("order", "Hymenoptera");
+        nameMap.put("superfamily", "Apoidea");
+
+        Node setNode = (Node) XmlUtil.applyXPath(doc, "set", XPathConstants.NODE);
+        if (setNode == null) {
+            throw new IllegalArgumentException("expected a [set] element, but found none");
+        }
+
+        putTextValueForElement(nameMap, setNode, "name", PropertyAndValueDictionary.NAME);
+        putTextValueForElement(nameMap, setNode, "authority", PropertyAndValueDictionary.AUTHORSHIP);
+
+        Node level = setNode.getAttributes().getNamedItem("level");
+        String taxonomicRank = level == null ? null : level.getTextContent();
+        // TreeMap rejects null keys, so a record without a level gets no rank entries
+        if (taxonomicRank != null) {
+            nameMap.put(PropertyAndValueDictionary.RANK, taxonomicRank);
+            nameMap.put(taxonomicRank, nameMap.get(PropertyAndValueDictionary.NAME));
+        }
+
+        NodeList attr = (NodeList) XmlUtil.applyXPath(setNode, "//attributes", XPathConstants.NODESET);
+        if (attr != null && attr.getLength() > 0) {
+            putFirstStatePerRank(nameMap, attr.item(0).getChildNodes());
+
+            List<String> pathRanks = Arrays.asList("kingdom", "phylum", "class", "order", "family", "subfamily", "tribe", "subtribe", "genus", "subgenus", "subspecies");
+            String pathNames = generateTaxonPathNames(nameMap, pathRanks, "", "genus", "specificEpithet", "subspecificEpithet", "species");
+            nameMap.put(PropertyAndValueDictionary.PATH_NAMES, pathNames);
+
+            List<String> path = new ArrayList<>();
+            for (String rank : StringUtils.splitByWholeSeparator(pathNames, CharsetConstant.SEPARATOR)) {
+                path.add(nameMap.get(rank));
+            }
+            nameMap.put(PropertyAndValueDictionary.PATH, StringUtils.join(path, CharsetConstant.SEPARATOR));
+        }
+        return nameMap;
+    }
+
+    /**
+     * Walks the children of an {@code attributes} element. Each {@code character}
+     * element starts a new group; for a character listed in {@link #RANKS}, the
+     * text of the first {@code state} element after it is stored under the
+     * lower-cased character name. Later states of that character are ignored.
+     */
+    private static void putFirstStatePerRank(Map<String, String> nameMap, NodeList childNodes) {
+        String pendingRank = null;
+        for (int i = 0; i < childNodes.getLength(); i++) {
+            Node childNode = childNodes.item(i);
+            String elementName = childNode.getNodeName();
+            if (StringUtils.equals("character", elementName)) {
+                String character = childNode.getTextContent();
+                pendingRank = RANKS.contains(character) ? character : null;
+            } else if (pendingRank != null && StringUtils.equals("state", elementName)) {
+                nameMap.put(StringUtils.lowerCase(pendingRank), childNode.getTextContent());
+                pendingRank = null;
+            }
+        }
+    }
+
+    /**
+     * Copies the text of the element selected by sourceElementName, if any,
+     * into the map under targetName.
+     */
+    private static void putTextValueForElement(Map<String, String> nameMap, Node setNode, String sourceElementName, String targetName) throws XPathExpressionException {
+        Node nameNode = (Node) XmlUtil.applyXPath(setNode, sourceElementName, XPathConstants.NODE);
+        if (nameNode != null) {
+            nameMap.put(targetName, nameNode.getTextContent());
+        }
+    }
+}
diff --git a/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2IntegrationTest.java b/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2IntegrationTest.java
@@ -0,0 +1,52 @@
+package org.eol.globi.taxon;
+
+import org.eol.globi.domain.NameType;
+import org.eol.globi.domain.Taxon;
+import org.eol.globi.domain.Term;
+import org.hamcrest.core.Is;
+import org.junit.Test;
+
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.zip.GZIPInputStream;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+
+/**
+ * Integration tests that run {@link DiscoverLifeUtil2} against the gzipped
+ * classpath resource named by {@link #BEES_XML_GZIP}.
+ */
+public class DiscoverLifeUtil2IntegrationTest {
+
+    public static final String BEES_XML_GZIP = "/org/globalbioticinteractions/nomer/match/discoverlife/bees.xml.gz";
+
+    /** Delegates to DiscoverLifeTestUtil with the local resource and the remote Apoidea_species.xml URL. */
+    @Test
+    public void compareLocalVersionToRemoteVersion() throws IOException {
+        DiscoverLifeTestUtil.compareLocalVersionToRemoteVersion(
+                BEES_XML_GZIP,
+                DiscoverLifeUtil.URL_ENDPOINT_DISCOVER_LIFE + "/nh/id/20q/Apoidea_species.xml"
+        );
+    }
+
+    /** Parses the local resource and checks how many taxa are reported to the listener. */
+    @Test
+    public void parseNames() throws ParserConfigurationException, IOException {
+        AtomicInteger counter = new AtomicInteger(0);
+        // fail with a clear message when the resource is missing, and always close the streams
+        try (java.io.InputStream resource = java.util.Objects.requireNonNull(
+                getClass().getResourceAsStream(BEES_XML_GZIP),
+                "test resource [" + BEES_XML_GZIP + "] not found");
+             GZIPInputStream is = new GZIPInputStream(resource)) {
+            DiscoverLifeUtil2.parse(is, new TermMatchListener() {
+                @Override
+                public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) {
+                    counter.incrementAndGet();
+                }
+            });
+        }
+        assertThat(counter.get(), Is.is(20932));
+    }
+
+}