Skip to content

Commit

Permalink
name synonymizer as inspired by @n8upham and #143
Browse files Browse the repository at this point in the history
  • Loading branch information
Jorrit Poelen committed Jun 27, 2024
1 parent 1d7bd21 commit b67e00d
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 51 deletions.
84 changes: 45 additions & 39 deletions nomer/src/main/java/org/eol/globi/service/Synonymizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,31 @@
import org.eol.globi.taxon.TermMatcher;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class Synonymizer implements TermMatcher {

private final TermMatcher matcher;

// Matches a (single-line) name whose first character is an ASCII uppercase letter (A-Z).
private static final Pattern CAPITALIZED = Pattern.compile("^[A-Z].*");
// Maps a word ending (key) to the endings that are proposed in its place (value).
// Being a TreeMap, the keys are visited in sorted order: a, e, i, ii, is, or, um, us.
// Note that keys can overlap: a word that ends in "ii" also ends in "i".
private static final Map<String, List<String>> SUFFIX_ALTERNATE_MAP = new TreeMap<String, List<String>>() {{
put("us", Arrays.asList("a", "um", "or")); // e.g., Mops russatus -> Mops russata
put("a", Arrays.asList("us", "um")); // e.g., Mops russata -> Mops russatus
put("um", Arrays.asList("a", "us")); // e.g., Mops russatum -> Mops russatus
put("is", Arrays.asList("e")); // e.g., Baeodon gracilis -> Baeodon gracile
put("e", Arrays.asList("is")); // e.g., Styloctenium mindorense -> Styloctenium mindorensis
put("or", Arrays.asList("us")); // e.g., major -> majus
put("i", Arrays.asList("ii")); // e.g., Mops bemmeleni -> Mops bemmelenii
put("ii", Arrays.asList("i")); // e.g., Plecotus christii -> Plecotus christi
}};

/**
 * Creates a synonymizer around an existing term matcher.
 *
 * @param matcher the matcher to keep; it is stored as-is (no null check is performed here)
 */
public Synonymizer(TermMatcher matcher) {
this.matcher = matcher;
}
Expand All @@ -30,9 +46,9 @@ public void foundTaxonForTerm(Long aLong, Term term, NameType nameType, Taxon ta
final AtomicBoolean rematched = new AtomicBoolean(false);
if (NameType.NONE.equals(nameType)) {
String name = term.getName();
List<String> alternates = proposeNameAlternate(name);
List<String> synonyms = proposeSynonymForUpToTwoNonGenusNameParts(name);
try {
matcher.match(alternates.stream().map(alt -> new TermImpl(term.getId(), alt)).collect(Collectors.toList()), new TermMatchListener() {
matcher.match(synonyms.stream().map(alt -> new TermImpl(term.getId(), alt)).collect(Collectors.toList()), new TermMatchListener() {
@Override
public void foundTaxonForTerm(Long aLong, Term term, NameType nameType, Taxon taxon) {
if (!NameType.NONE.equals(nameType)) {
Expand All @@ -54,54 +70,44 @@ public void foundTaxonForTerm(Long aLong, Term term, NameType nameType, Taxon ta

}

public static List<String> proposeNameAlternate(String name) {
String[] parts = StringUtils.split(name, ' ');
List<List<String>> alternates = new ArrayList<>();
static List<String> proposeSynonymForUpToTwoNonGenusNameParts(String name) {
List<String> heads = Collections.emptyList();
List<List<String>> alternates = new ArrayList<>();
if (StringUtils.isNotBlank(name) && CAPITALIZED.matcher(name).matches()) {
heads = proposeSynonymForUpToTwoNonGenusNameParts(name, heads, alternates);
}
List<String> expand = expand(heads, alternates);
return expand
.stream()
.filter(alt -> !StringUtils.equals(name, alt))
.collect(Collectors.toList());
}

private static List<String> proposeSynonymForUpToTwoNonGenusNameParts(String name, List<String> heads, List<List<String>> alternates) {
String[] parts = StringUtils.split(name, ' ');
if (parts != null && parts.length > 0) {
heads = Collections.singletonList(parts[0]);
int alteredPartCount = 0;
for (int i = 1; i < parts.length; i++) {
String part = parts[i];
List<String> alternateForPart = new ArrayList<>();
alternateForPart.add(part);
if (StringUtils.isAllLowerCase(part)) {
if (StringUtils.endsWith(part, "ii")) {
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 1));
} else if (StringUtils.endsWith(part, "i")) {
alternateForPart.add(part + "i");
}
if (StringUtils.endsWith(part, "us")) {
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "a");
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "um");
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "or");
}
if (StringUtils.endsWith(part, "a")) {
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 1) + "us");
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 1) + "um");
}
if (StringUtils.endsWith(part, "um")) {
String stem = StringUtils.substring(part, 0, part.length() - 2);
alternateForPart.add(stem + "us");
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "a");
}
if (StringUtils.endsWith(part, "is")) {
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "e");
}
if (StringUtils.endsWith(part, "e")) {
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 1) + "is");
}
if (StringUtils.endsWith(part, "or")) {
alternateForPart.add(StringUtils.substring(part, 0, part.length() - 2) + "us");
}
if (StringUtils.isAllLowerCase(part) && alteredPartCount < 2) {
SUFFIX_ALTERNATE_MAP
.forEach((key, suffixAlternates) -> {
if (StringUtils.endsWith(part, key)) {
String stem = StringUtils.substring(part, 0, part.length() - key.length());
suffixAlternates.forEach(suffixAlt -> alternateForPart.add(stem + suffixAlt));
}
});
}
if (alternateForPart.size() > 1) {
alteredPartCount++;
}
alternates.add(alternateForPart);
}
}
List<String> expand = expand(heads, alternates);
return expand
.stream()
.filter(alt -> !StringUtils.equals(name, alt))
.collect(Collectors.toList());
return heads;
}

public static List<String> expand(List<String> heads, List<List<String>> tail) {
Expand Down
52 changes: 40 additions & 12 deletions nomer/src/test/java/org/eol/globi/service/SynonymizerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.core.IsCollectionContaining.hasItem;
import static org.hamcrest.core.IsCollectionContaining.hasItems;
import static org.hamcrest.core.IsNot.not;

public class SynonymizerTest {

Expand Down Expand Up @@ -64,92 +65,92 @@ public void foundTaxonForTerm(Long aLong, Term term, NameType nameType, Taxon ta
// An epithet ending in "ii" should have its single-"i" spelling among the proposals.
@Test
public void proposeNameAlternate() {
assertThat(
Synonymizer.proposeNameAlternate("Bla blaii"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Bla blaii"),
hasItem("Bla blai")
);
}

// An epithet ending in a single "i" should have its "ii" spelling among the proposals.
@Test
public void proposeNameAlternate2() {
assertThat(
Synonymizer.proposeNameAlternate("Bla blai"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Bla blai"),
hasItem("Bla blaii")
);
}

// An epithet ending in "us" should have its "a" form among the proposals.
@Test
public void proposeNameAlternate3() {
assertThat(
Synonymizer.proposeNameAlternate("Donald duckus"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald duckus"),
hasItem("Donald ducka")
);
}

// An epithet ending in "a" should have its "us" form among the proposals.
@Test
public void proposeNameAlternate4() {
assertThat(
Synonymizer.proposeNameAlternate("Donald ducka"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald ducka"),
hasItem("Donald duckus")
);
}

// With a "var." rank marker in between, both epithets are varied (together and
// individually) while the marker itself stays untouched.
@Test
public void proposeNameAlternate44() {
assertThat(
Synonymizer.proposeNameAlternate("Donald ducka var. ducka"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald ducka var. ducka"),
hasItems("Donald duckus var. duckus", "Donald duckus var. ducka")
);
}

// A trailing authorship string "(L. 1758)" is carried over unchanged into the proposals.
@Test
public void proposeNameAlternate45() {
assertThat(
Synonymizer.proposeNameAlternate("Donald ducka var. ducka (L. 1758)"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald ducka var. ducka (L. 1758)"),
hasItems("Donald duckus var. ducka (L. 1758)")
);
}

// An epithet ending in "e" should have its "is" form among the proposals.
@Test
public void proposeNameAlternate5() {
assertThat(
Synonymizer.proposeNameAlternate("Archaeotherium palustre"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Archaeotherium palustre"),
hasItem("Archaeotherium palustris")
);
}

// An epithet ending in "is" should have its "e" form among the proposals.
@Test
public void proposeNameAlternate6() {
assertThat(
Synonymizer.proposeNameAlternate("Archaeotherium palustris"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Archaeotherium palustris"),
hasItem("Archaeotherium palustre")
);
}

// An epithet ending in "or" should have its "us" form among the proposals.
@Test
public void proposeNameAlternate7() {
assertThat(
Synonymizer.proposeNameAlternate("Donald major"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald major"),
hasItem("Donald majus")
);
}

// An epithet ending in "a" should have its "um" form among the proposals.
@Test
public void proposeNameAlternate9() {
assertThat(Synonymizer.proposeNameAlternate("Carollia brevicauda"), hasItem("Carollia brevicaudum"));
assertThat(Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Carollia brevicauda"), hasItem("Carollia brevicaudum"));
}

// In a three-word name, the middle word ending in "um" can be swapped to "a"
// while the last word is kept as given.
@Test
public void proposeNameAlternate10() {
assertThat(
Synonymizer.proposeNameAlternate("Carollia brevicaudum colombiana"),
Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Carollia brevicaudum colombiana"),
hasItem("Carollia brevicauda colombiana")
);

}

@Test
public void proposeNameAlternate8() {
List<String> alternate = Synonymizer.proposeNameAlternate("Donald majus");
List<String> alternate = Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts("Donald majus");
assertThat(
alternate,
hasItems("Donald major", "Donald maja", "Donald majum")
Expand All @@ -160,4 +161,31 @@ public void proposeNameAlternate8() {
);
}

// For a name with many words, only the first two variable words after the first
// word are altered; later words are kept as given.
@Test
public void superLong() {
    final String name = "Donald majus bla blaus bla bla bla bla";
    final List<String> proposed = Synonymizer.proposeSynonymForUpToTwoNonGenusNameParts(name);

    // "majus" and the first "bla" are varied ...
    assertThat(proposed, hasItem("Donald maja blus blaus bla bla bla bla"));
    // ... but the last "bla" is not
    assertThat(proposed, not(hasItem("Donald maja blus blaus bla bla bla blus")));

    assertThat(proposed.size(), Is.is(11));
}

// A name whose first word is not capitalized yields no proposals at all.
@Test
public void noCapitalizedGenus() {
    final int proposalCount = Synonymizer
            .proposeSynonymForUpToTwoNonGenusNameParts("donald majus bla blaus bla bla bla bla")
            .size();

    assertThat(proposalCount, Is.is(0));
}

}

0 comments on commit b67e00d

Please sign in to comment.