diff --git a/nomer-taxon-resolver/src/main/java/org/eol/globi/taxon/DiscoverLifeUtil2.java b/nomer-taxon-resolver/src/main/java/org/eol/globi/taxon/DiscoverLifeUtil2.java index 6f8d1d26..75391035 100644 --- a/nomer-taxon-resolver/src/main/java/org/eol/globi/taxon/DiscoverLifeUtil2.java +++ b/nomer-taxon-resolver/src/main/java/org/eol/globi/taxon/DiscoverLifeUtil2.java @@ -42,11 +42,33 @@ public class DiscoverLifeUtil2 { public static final List RANKS = Arrays.asList("Family", "Subfamily", "Tribe", "Subtribe", "Genus", "Subgenus"); - public static final String NAME_PATTERN_AUTHORSHIP_PARENTHESES = "(?[A-Z][a-z]+[ ][a-z]+)[ ]+(?[(][^,]+[,][ ][0-9]{4}[)])"; - public static final String NAME_PATTERN_WITH_NOTE = "(?[A-Z][a-z]+[ ][a-z]+)(?[_][_a-z]+)[ ]+(?[^,]+[,][ ][0-9]{4})"; - public static final String NAME_PATTERN_WITH_PARENTHESIS = "(?[A-Z][a-z]+[ ][(][A-Z][a-z]+[)][ ][a-z]+)[ ]+(?[^,]+[,][ ][0-9]{4})"; - public static final String NAME_PATTERN_AUTHORSHIP_MULTIPLE_AUTHORS = "(?[A-Z][a-z]+[ ][a-z]+)[ ]+(?([A-Z][a-z]+)([ ]and[ ])([A-Z][a-z]+)[,][ ][0-9]{4})"; - public static final List NAME_PATTERNS = Arrays.asList(NAME_PATTERN_AUTHORSHIP_PARENTHESES, NAME_PATTERN_WITH_NOTE, NAME_PATTERN_WITH_PARENTHESIS, NAME_PATTERN_AUTHORSHIP_MULTIPLE_AUTHORS); + + public static final String NAME = "(?([A-Z][a-z]+)([ ][a-z]+)([ ][a-z]+){0,1})"; + public static final String NAME_PARENTHESIS = "(?[A-Z][a-z]+[ ][(][A-Z][a-z]+[)][ ][a-z]+)"; + public static final String AUTHORSHIP = "(?[^,]+[,][ ][0-9]{4})"; + public static final String AUTHORSHIP_PARENTHESIS = "(?[(][^,]+[,][ ][0-9]{4}[)])"; + public static final String AUTHORSHIP_AND = "(?([A-Z][a-z]+)([ ]and[ ])([A-Z][a-z]+)[,][ ][0-9]{4})"; + public static final String SPACE = "[ ]+"; + public static final String NOTE = "(?[_][_a-z]+)"; + + public static final String NAME_AUTHORSHIP_PARENTHESES + = NAME + SPACE + AUTHORSHIP_PARENTHESIS; + public static final String NAME_AUTHORSHIP + = NAME + SPACE + AUTHORSHIP; + public static final String NAME_WITH_NOTE + = NAME + NOTE + SPACE + AUTHORSHIP; + public static final String NAME_WITH_PARENTHESIS + = NAME_PARENTHESIS + SPACE + AUTHORSHIP; + public static final String NAME_AUTHORSHIP_MULTIPLE_AUTHORS + = NAME + SPACE + AUTHORSHIP_AND; + + + public static final List NAME_PATTERNS = Arrays.asList( + NAME_AUTHORSHIP, + NAME_AUTHORSHIP_PARENTHESES, + NAME_WITH_NOTE, + NAME_WITH_PARENTHESIS, + NAME_AUTHORSHIP_MULTIPLE_AUTHORS); public static void splitRecords(InputStream is, Consumer lineConsumer) { Scanner scanner = new Scanner(is, StandardCharsets.UTF_8.name()); diff --git a/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2IntegrationTest.java b/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2IntegrationTest.java index 87169b0f..1507aaf1 100644 --- a/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2IntegrationTest.java +++ b/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2IntegrationTest.java @@ -33,14 +33,9 @@ public void parseNames() throws ParserConfigurationException, IOException { @Override public void foundTaxonForTerm(Long requestId, Term providedTerm, NameType nameType, Taxon resolvedTaxon) { counter.incrementAndGet(); - Taxon providedTaxon = (Taxon) providedTerm; - if (!NameType.HAS_ACCEPTED_NAME.equals(nameType)) { - System.out.println(providedTaxon.getName()); - System.out.println(providedTaxon.getAuthorship()); - } } }); - assertThat(counter.get(), Is.is(51164)); + assertThat(counter.get(), Is.is(31278)); } } \ No newline at end of file diff --git a/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2Test.java b/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2Test.java index e22e4a3d..f21f24ef 100644 --- a/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2Test.java +++ b/nomer-taxon-resolver/src/test/java/org/eol/globi/taxon/DiscoverLifeUtil2Test.java @@ -48,9 +48,9 @@ public void parseRelatedNames() throws ParserConfigurationException, XPathExpres List taxons = DiscoverLifeUtil2.parseRelatedNames(doc); - assertThat(taxons.size(), is(13)); + assertThat(taxons.size(), is(17)); - Taxon lastTaxon = taxons.get(12); + Taxon lastTaxon = taxons.get(taxons.size() - 1); assertThat(lastTaxon.getName(), is("Andrena (Holandrena) cressonii")); assertThat(lastTaxon.getAuthorship(), is("Robertson, 1891")); } @@ -168,6 +168,14 @@ public void parseNameAlt4() { } + @Test + public void parseNameAlt5() { + Taxon matched = DiscoverLifeUtil2.parse("Agapostemon texanus subtilior Cockerell, 1898"); + assertThat(matched.getName(), is("Agapostemon texanus subtilior")); + assertThat(matched.getAuthorship(), is("Cockerell, 1898")); + + } + @Test public void patchCommonNames() { String name = "Protandrena bachue Gonzalez and Ruz, 2007 Rhophitulus bachue (Gonzalez and Ruz, 2007)";