diff --git a/nomer/src/main/java/org/eol/globi/service/Synonymizer.java b/nomer/src/main/java/org/eol/globi/service/Synonymizer.java
index f2d0f48..6e928ab 100644
--- a/nomer/src/main/java/org/eol/globi/service/Synonymizer.java
+++ b/nomer/src/main/java/org/eol/globi/service/Synonymizer.java
@@ -9,15 +9,31 @@ import org.eol.globi.taxon.TermMatcher;
 
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 public class Synonymizer implements TermMatcher {
 
     private final TermMatcher matcher;
 
+    private static final Pattern CAPITALIZED = Pattern.compile("^[A-Z].*");
+    private static final Map<String, List<String>> SUFFIX_ALTERNATE_MAP = new TreeMap<String, List<String>>() {{
+        put("us", Arrays.asList("a", "um", "or")); // e.g., Mops russatus -> Mops russata
+        put("a", Arrays.asList("us", "um")); // e.g., Mops russata -> Mops russatus
+        put("um", Arrays.asList("a", "us")); // e.g., Mops russatum -> Mops russatus
+        put("is", Arrays.asList("e")); // e.g., Baeodon gracilis -> Baeodon gracile
+        put("e", Arrays.asList("is")); // e.g., Styloctenium mindorense -> Styloctenium mindorensis
+        put("or", Arrays.asList("us")); // e.g., major -> majus
+        put("i", Arrays.asList("ii")); // e.g., Mops bemmeleni -> Mops bemmelenii
+        put("ii", Arrays.asList("i")); // e.g., Plecotus christii -> Plecotus christi
+    }};
+
     public Synonymizer(TermMatcher matcher) {
         this.matcher = matcher;
     }
@@ -30,9 +46,9 @@ public void foundTaxonForTerm(Long aLong, Term term, NameType nameType, Taxon ta
                 final AtomicBoolean rematched = new AtomicBoolean(false);
                 if (NameType.NONE.equals(nameType)) {
                     String name = term.getName();
-                    List<String> alternates = proposeNameAlternate(name);
+                    List<String> synonyms = proposeSynonymForUpToTwoNonGenusNameParts(name);
                     try {
-                        matcher.match(alternates.stream().map(alt -> new TermImpl(term.getId(), alt)).collect(Collectors.toList()), new TermMatchListener() {
+                        matcher.match(synonyms.stream().map(alt -> new TermImpl(term.getId(), alt)).collect(Collectors.toList()), new TermMatchListener() {
                             @Override
                             public void foundTaxonForTerm(Long aLong, Term term, NameType nameType, Taxon taxon) {
                                 if (!NameType.NONE.equals(nameType)) {
@@ -54,54 +70,63 @@ public void foundTaxonForTerm(Long aLong, Term term, NameType nameType, Taxon ta
 
     }
 
-    public static List<String> proposeNameAlternate(String name) {
-        String[] parts = StringUtils.split(name, ' ');
-        List<List<String>> alternates = new ArrayList<>();
+    /**
+     * Proposes alternate spellings of a name by swapping the suffixes listed in
+     * {@link #SUFFIX_ALTERNATE_MAP} for up to two lowercase, non-genus name parts.
+     *
+     * @param name name to propose alternates for; blank or non-capitalized names yield no alternates
+     * @return proposed alternate names, excluding the provided name itself
+     */
+    static List<String> proposeSynonymForUpToTwoNonGenusNameParts(String name) {
         List<String> heads = Collections.emptyList();
+        List<List<String>> alternates = new ArrayList<>();
+        if (StringUtils.isNotBlank(name) && CAPITALIZED.matcher(name).matches()) {
+            heads = proposeSynonymForUpToTwoNonGenusNameParts(name, heads, alternates);
+        }
+        List<String> expand = expand(heads, alternates);
+        return expand
+                .stream()
+                .filter(alt -> !StringUtils.equals(name, alt))
+                .collect(Collectors.toList());
+    }
+
+    /**
+     * Collects, for each space-separated part following the first, the part itself and its suffix alternates.
+     *
+     * @param name       name to split
+     * @param heads      returned as-is when the name contains no parts
+     * @param alternates receives one list of candidates per part following the first
+     * @return singleton list holding the first name part, or the provided heads
+     */
+    private static List<String> proposeSynonymForUpToTwoNonGenusNameParts(String name, List<String> heads, List<List<String>> alternates) {
+        String[] parts = StringUtils.split(name, ' ');
         if (parts != null && parts.length > 0) {
             heads = Collections.singletonList(parts[0]);
+            int alteredPartCount = 0;
             for (int i = 1; i < parts.length; i++) {
                 String part = parts[i];
                 List<String> alternateForPart = new ArrayList<>();
                 alternateForPart.add(part);
-                if (StringUtils.isAllLowerCase(part)) {
-                    if (StringUtils.endsWith(part, "ii")) {
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 1));
-                    } else if (StringUtils.endsWith(part, "i")) {
-                        alternateForPart.add(part + "i");
-                    }
-                    if (StringUtils.endsWith(part, "us")) {
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "a");
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "um");
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "or");
-                    }
-                    if (StringUtils.endsWith(part, "a")) {
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 1) + "us");
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 1) + "um");
-                    }
-                    if (StringUtils.endsWith(part, "um")) {
-                        String stem = StringUtils.substring(part, 0, part.length() - 2);
-                        alternateForPart.add(stem + "us");
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "a");
-                    }
-                    if (StringUtils.endsWith(part, "is")) {
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "e");
-                    }
-                    if (StringUtils.endsWith(part, "e")) {
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 1) + "is");
-                    }
-                    if (StringUtils.endsWith(part, "or")) {
-                        alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "us");
-                    }
+                if (StringUtils.isAllLowerCase(part) && alteredPartCount < 2) {
+                    // only apply the longest matching suffix: "ii" also ends with "i",
+                    // and should not yield "iii" (e.g., Bla blaii -> Bla blaiii)
+                    SUFFIX_ALTERNATE_MAP
+                            .keySet()
+                            .stream()
+                            .filter(suffix -> StringUtils.endsWith(part, suffix))
+                            .max((a, b) -> Integer.compare(a.length(), b.length()))
+                            .ifPresent(suffix -> {
+                                String stem = StringUtils.substring(part, 0, part.length() - suffix.length());
+                                SUFFIX_ALTERNATE_MAP.get(suffix).forEach(suffixAlt -> alternateForPart.add(stem + suffixAlt));
+                            });
+                }
+                if (alternateForPart.size() > 1) {
+                    alteredPartCount++;
                 }
                 alternates.add(alternateForPart);
             }
         }
-        List<String> expand = expand(heads, alternates);
-        return expand
-                .stream()
-                .filter(alt -> !StringUtils.equals(name, alt))
-                .collect(Collectors.toList());
+        return heads;
     }
 
     public static List<String> expand(List<String> heads, List<List<String>> tail) {
diff --git a/nomer/src/test/java/org/eol/globi/service/SynonymizerTest.java b/nomer/src/test/java/org/eol/globi/service/SynonymizerTest.java
index 0cd8479..9b8dbc2 100644
--- a/nomer/src/test/java/org/eol/globi/service/SynonymizerTest.java
+++ b/nomer/src/test/java/org/eol/globi/service/SynonymizerTest.java
@@ -17,6 +17,7 @@ import static org.hamcrest.MatcherAssert.assertThat;
 
 import static org.hamcrest.core.IsCollectionContaining.hasItem;
 import static org.hamcrest.core.IsCollectionContaining.hasItems;
+import static org.hamcrest.core.IsNot.not;
 
 public class SynonymizerTest {
 
@@ -64,7 +65,7 @@ public void foundTaxonForTerm(Long aLong, Term term, NameType nameType, Taxon ta
     @Test
     public void proposeNameAlternate() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Bla blaii"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Bla blaii"),
                 hasItem("Bla blai")
         );
     }
@@ -72,7 +73,7 @@ public void proposeNameAlternate() {
     @Test
     public void proposeNameAlternate2() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Bla blai"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Bla blai"),
                 hasItem("Bla blaii")
         );
     }
@@ -80,7 +81,7 @@ public void proposeNameAlternate2() {
     @Test
     public void proposeNameAlternate3() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Donald duckus"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald duckus"),
                 hasItem("Donald ducka")
         );
     }
@@ -88,7 +89,7 @@ public void proposeNameAlternate3() {
     @Test
     public void proposeNameAlternate4() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Donald ducka"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald ducka"),
                 hasItem("Donald duckus")
         );
     }
@@ -96,7 +97,7 @@ public void proposeNameAlternate4() {
     @Test
     public void proposeNameAlternate44() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Donald ducka var. ducka"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald ducka var. ducka"),
                 hasItems("Donald duckus var. duckus", "Donald duckus var. ducka")
         );
     }
@@ -104,7 +105,7 @@ public void proposeNameAlternate44() {
     @Test
     public void proposeNameAlternate45() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Donald ducka var. ducka (L. 1758)"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald ducka var. ducka (L. 1758)"),
                 hasItems("Donald duckus var. ducka (L. 1758)")
         );
     }
@@ -112,7 +113,7 @@ public void proposeNameAlternate45() {
     @Test
     public void proposeNameAlternate5() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Archaeotherium palustre"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Archaeotherium palustre"),
                 hasItem("Archaeotherium palustris")
         );
     }
@@ -120,7 +121,7 @@ public void proposeNameAlternate5() {
     @Test
     public void proposeNameAlternate6() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Archaeotherium palustris"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Archaeotherium palustris"),
                 hasItem("Archaeotherium palustre")
         );
     }
@@ -128,20 +129,20 @@ public void proposeNameAlternate6() {
     @Test
     public void proposeNameAlternate7() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Donald major"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald major"),
                 hasItem("Donald majus")
         );
     }
 
     @Test
     public void proposeNameAlternate9() {
-        assertThat(Synonymizer.proposeNameAlternate("Carollia brevicauda"), hasItem("Carollia brevicaudum"));
+        assertThat(Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Carollia brevicauda"), hasItem("Carollia brevicaudum"));
     }
 
     @Test
     public void proposeNameAlternate10() {
         assertThat(
-                Synonymizer.proposeNameAlternate("Carollia brevicaudum colombiana"),
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Carollia brevicaudum colombiana"),
                 hasItem("Carollia brevicauda colombiana")
         );
 
@@ -149,7 +150,7 @@ public void proposeNameAlternate10() {
 
     @Test
     public void proposeNameAlternate8() {
-        List<String> alternate = Synonymizer.proposeNameAlternate("Donald majus");
+        List<String> alternate = Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald majus");
         assertThat(
                 alternate,
                 hasItems("Donald major", "Donald maja", "Donald majum")
@@ -160,4 +161,39 @@ public void proposeNameAlternate8() {
         );
     }
 
+    @Test
+    public void superLong() {
+        List<String> alternate = Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald majus bla blaus bla bla bla bla");
+        assertThat(
+                alternate,
+                hasItem("Donald maja blus blaus bla bla bla bla")
+        );
+        assertThat(
+                alternate,
+                not(hasItem("Donald maja blus blaus bla bla bla blus"))
+        );
+
+        assertThat(
+                alternate.size(),
+                Is.is(11)
+        );
+    }
+
+    @Test
+    public void noCapitalizedGenus() {
+        List<String> alternate = Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("donald majus bla blaus bla bla bla bla");
+        assertThat(
+                alternate.size(),
+                Is.is(0)
+        );
+    }
+
+    @Test
+    public void noTripleIForDoubleISuffix() {
+        assertThat(
+                Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Bla blaii"),
+                not(hasItem("Bla blaiii"))
+        );
+    }
+
 }
\ No newline at end of file