From c73fff4c9891265e1be326a9d8636571d47b26df Mon Sep 17 00:00:00 2001 From: Peter Ansell Date: Wed, 14 Oct 2020 09:49:19 +1100 Subject: [PATCH 01/19] Bump parent pom --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index eb6906c93..412fb9fd5 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ au.org.ala ala-parent-pom - 9 + 14 4.0.0 @@ -96,7 +96,7 @@ junit junit - 4.12 + 4.13.1 test From f0ab298b5310fc2052747e8d1f574a5a3d847be4 Mon Sep 17 00:00:00 2001 From: pal155 Date: Mon, 8 Feb 2021 16:25:49 +1100 Subject: [PATCH 02/19] Bump development version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8bfb7fee9..d759f3fe5 100644 --- a/pom.xml +++ b/pom.xml @@ -11,7 +11,7 @@ ala-name-matching jar - 3.5 + 3.6-SNAPSHOT ALA Name Matching (for Lucene 6 or above) From 429fb65c065b1f4131ede6df4ffe22de6de90358 Mon Sep 17 00:00:00 2001 From: pal155 Date: Tue, 8 Jun 2021 14:47:28 +1000 Subject: [PATCH 03/19] Version to use with Lucene 8 (8.1.0) Bump development version to a new major revision to associate with new Lucene version Update travis to use the new index. Note that the -lucene8 index does not have the same left- and right-values as the production index. It's for testing only. 
--- .travis.yml | 8 +-- pom.xml | 13 ++-- .../java/au/org/ala/names/index/Taxonomy.java | 2 +- .../analyzer/LowerCaseKeywordAnalyzer.java | 35 ++++++----- .../org/ala/names/search/ALANameIndexer.java | 25 ++++---- .../org/ala/names/search/ALANameSearcher.java | 32 +++++----- .../org/ala/names/search/DwcaNameIndexer.java | 22 +++---- .../java/au/org/ala/names/util/TermDump.java | 8 ++- .../ala/names/search/ALANameSearcherTest.java | 62 ++++++++++++++++++- .../ala/names/search/BiocacheMatchTest.java | 2 +- .../ala/names/search/IconicSpeciesTest.java | 2 +- .../ala/names/search/VernacularMatchTest.java | 2 +- 12 files changed, 142 insertions(+), 71 deletions(-) diff --git a/.travis.yml b/.travis.yml index e0186b7ae..57ba34a7f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,15 +10,15 @@ branches: before_install: - mkdir -p ~/.m2; wget -q -O ~/.m2/settings.xml https://raw.githubusercontent.com/AtlasOfLivingAustralia/travis-build-configuration/master/travis_maven_settings_simple.xml -- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20200214.tgz https://archives.ala.org.au/archives/nameindexes/20200214/namematching-20200214.tgz +- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20200214-lucene8.tgz https://archives.ala.org.au/archives/nameindexes/20200214/namematching-20200214-lucene8.tgz - cd /data/lucene -- sudo tar zxvf namematching-20200214.tgz -- sudo ln -s namematching-20200214 namematching +- sudo tar zxvf namematching-20200214-lucene8.tgz +- sudo ln -s namematching-20200214-lucene8 namematching - ls -laF - cd $TRAVIS_BUILD_DIR script: -- "[ \"${TRAVIS_PULL_REQUEST}\" = \"false\" ] && mvn -P travis clean install deploy || mvn -P travis clean install" +- 'if [ "${TRAVIS_PULL_REQUEST}" = "false" ]; then mvn -P travis clean install deploy; else mvn -P travis clean install; fi' env: global: diff --git a/pom.xml b/pom.xml index e254ea916..66835aac5 100644 --- a/pom.xml +++ b/pom.xml @@ -11,8 +11,8 @@ ala-name-matching jar - 
3.6-SNAPSHOT - ALA Name Matching (for Lucene 6 or above) + 4.0-SNAPSHOT + ALA Name Matching (for Lucene 8 or above) scm:git:git@github.com:AtlasOfLivingAustralia/ala-name-matching.git @@ -23,7 +23,8 @@ UTF-8 - 6.6.5 + 8.1.0 + 2.12.3 1.8 java18 1.0 @@ -146,17 +147,17 @@ com.fasterxml.jackson.core jackson-core - 2.8.9 + ${com.fasterxml.jackson.version} com.fasterxml.jackson.core jackson-databind - 2.8.9 + ${com.fasterxml.jackson.version} com.fasterxml.jackson.core jackson-annotations - 2.8.9 + ${com.fasterxml.jackson.version} diff --git a/src/main/java/au/org/ala/names/index/Taxonomy.java b/src/main/java/au/org/ala/names/index/Taxonomy.java index 471bbd6be..881da9b75 100644 --- a/src/main/java/au/org/ala/names/index/Taxonomy.java +++ b/src/main/java/au/org/ala/names/index/Taxonomy.java @@ -1659,7 +1659,7 @@ public List> getIndexValues(Term type, String taxonID) throws I IndexSearcher searcher = this.searcherManager.acquire(); try { TopDocs docs = searcher.search(query, 100, Sort.INDEXORDER); - List> valueList = new ArrayList<>(docs.totalHits); + List> valueList = new ArrayList<>((int) docs.totalHits.value); for (ScoreDoc sd : docs.scoreDocs) { Document document = searcher.doc(sd.doc); Map values = new HashMap<>(); diff --git a/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java b/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java index 6e2f12fb9..e73a7551c 100644 --- a/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java +++ b/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java @@ -19,9 +19,14 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizerFactory; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.LowerCaseFilterFactory; +import 
org.apache.lucene.analysis.custom.CustomAnalyzer; import org.apache.lucene.util.Version; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A custom KeywordAnalyzer that converts the text to lowercase before tokenizing @@ -29,20 +34,20 @@ * * @author Natasha */ -public final class LowerCaseKeywordAnalyzer extends Analyzer { +public final class LowerCaseKeywordAnalyzer { + private static final Logger logger = LoggerFactory.getLogger(LowerCaseKeywordAnalyzer.class); - @Override - protected TokenStreamComponents createComponents(String fieldName) { - - KeywordTokenizer src = new KeywordTokenizer(); - TokenStream result = new LowerCaseFilter(src); - - return new TokenStreamComponents(src, result) { - - @Override - protected void setReader(final Reader reader){ - super.setReader(reader); - } - }; + /** + * Get an instance of a lower-case keyword analyser. + * + * @return The analyser + */ + public static Analyzer newInstance() { + try { + return CustomAnalyzer.builder().withTokenizer(KeywordTokenizerFactory.class).addTokenFilter(LowerCaseFilterFactory.class).build(); + } catch (IOException ex) { + logger.error("Unable to build analyzer", ex); + throw new IllegalStateException(ex); + } } -} + } diff --git a/src/main/java/au/org/ala/names/search/ALANameIndexer.java b/src/main/java/au/org/ala/names/search/ALANameIndexer.java index f86800453..0661cf64a 100644 --- a/src/main/java/au/org/ala/names/search/ALANameIndexer.java +++ b/src/main/java/au/org/ala/names/search/ALANameIndexer.java @@ -200,7 +200,7 @@ public void createIndex(String exportsDir, String indexDir, boolean generateSciN * @throws Exception */ public void createIrmngIndex(String exportsDir, String indexDir) throws Exception { - Analyzer analyzer = new LowerCaseKeywordAnalyzer(); + Analyzer analyzer = LowerCaseKeywordAnalyzer.newInstance(); IndexWriter irmngWriter = createIndexWriter(new File(indexDir + File.separator + "irmng"), analyzer, true); indexIrmngDwcA(irmngWriter, irmngDwcaDirectory); 
indexIRMNG(irmngWriter, exportsDir + File.separator + "ala-species-homonyms.txt", RankType.SPECIES); @@ -211,7 +211,7 @@ public void createIrmngIndex(String exportsDir, String indexDir) throws Exceptio public void createIndex(String exportsDir, String indexDir, String acceptedFile, String synonymFile, String irmngDwca, boolean generateSciNames, boolean generateCommonNames) throws Exception { - Analyzer analyzer = new LowerCaseKeywordAnalyzer(); + Analyzer analyzer = LowerCaseKeywordAnalyzer.newInstance(); //generate the extra id index createExtraIdIndex(indexDir + File.separator + "id", new File(exportsDir + File.separator + "identifiers.txt")); if (generateSciNames) { @@ -376,7 +376,7 @@ private void indexALA(IndexWriter iw, String file, String synonymFile) throws Ex public void addAdditionalName(String lsid, String scientificName, String author, LinnaeanRankClassification cl) throws Exception { if (cbIndexWriter == null) - cbIndexWriter = createIndexWriter(new File(indexDirectory + File.separator + "cb"), new LowerCaseKeywordAnalyzer(), false); + cbIndexWriter = createIndexWriter(new File(indexDirectory + File.separator + "cb"), LowerCaseKeywordAnalyzer.newInstance(), false); Document doc = createALAIndexDocument(scientificName, "-1", lsid, author, cl); cbIndexWriter.addDocument(doc); @@ -389,7 +389,7 @@ public void addAdditionalName(String lsid, String scientificName, String author, */ public void deleteName(String lsid) throws Exception{ if(cbIndexWriter == null){ - cbIndexWriter = createIndexWriter(new File(indexDirectory+ File.separator + "cb"), new LowerCaseKeywordAnalyzer(), false); + cbIndexWriter = createIndexWriter(new File(indexDirectory+ File.separator + "cb"), LowerCaseKeywordAnalyzer.newInstance(), false); } Term term = new Term("lsid", lsid); cbIndexWriter.deleteDocuments(new TermQuery(term)); @@ -568,7 +568,7 @@ private void addCoLCommonNames(IndexWriter iw, IndexSearcher currentSearcher) th while ((values = reader.readNext()) != null) { if 
(values.length == 3) { if (doesTaxonConceptExist(currentSearcher, values[2])) { - iw.addDocument(createCommonNameDocument(values[0], values[1], values[2], null, 1.0f)); + iw.addDocument(createCommonNameDocument(values[0], values[1], values[2], null)); count++; } else { System.out.println("Unable to locate LSID " + values[2] + " in current dump"); @@ -605,13 +605,13 @@ private void addAnbgCommonNames(String fileName, IndexWriter iw, IndexSearcher c if (doesTaxonConceptExist(currentSearcher, values[3]) || doesTaxonConceptExist(idSearcher, values[3])) { //each common name could be a comma separated list if (!values[2].contains(",") || values[2].toLowerCase().contains(" and ")) { - iw.addDocument(createCommonNameDocument(values[2], null, values[3], null, 2.0f)); + iw.addDocument(createCommonNameDocument(values[2], null, values[3], null)); count++; } else { //we need to process each common name in the list String[] names = p.split(values[2]); for (String name : names) { - iw.addDocument(createCommonNameDocument(name, null, values[3],null, 2.0f)); + iw.addDocument(createCommonNameDocument(name, null, values[3],null)); count++; } } @@ -723,7 +723,7 @@ private boolean doesTaxonConceptExist(IndexSearcher is, String lsid) { TermQuery query = new TermQuery(new Term("lsid", lsid)); try { org.apache.lucene.search.TopDocs results = is.search(query, 1); - return results.totalHits > 0; + return results.totalHits.value > 0; } catch (IOException e) { return false; } @@ -743,7 +743,7 @@ private String getAcceptedLSID(String value) { try { TermQuery tq = new TermQuery(new Term("lsid", value)); org.apache.lucene.search.TopDocs results = idSearcher.search(tq, 1); - if (results.totalHits > 0) + if (results.totalHits.value > 0) return idSearcher.doc(results.scoreDocs[0].doc).get("reallsid"); } catch (IOException e) { } @@ -751,16 +751,15 @@ private String getAcceptedLSID(String value) { return value; } - protected Document createCommonNameDocument(String cn, String sn, String lsid, String 
language, float boost){ - return createCommonNameDocument(cn, sn, lsid, language, boost, true); + protected Document createCommonNameDocument(String cn, String sn, String lsid, String language){ + return createCommonNameDocument(cn, sn, lsid, language, true); } - protected Document createCommonNameDocument(String cn, String sn, String lsid, String language, float boost, boolean checkAccepted) { + protected Document createCommonNameDocument(String cn, String sn, String lsid, String language, boolean checkAccepted) { Document doc = new Document(); //we are only interested in keeping all the alphanumerical values of the common name //when searching the same operations will need to be peformed on the search string TextField searchAbleName = new TextField(IndexField.SEARCHABLE_COMMON_NAME.toString(), cn.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""), Store.YES); - searchAbleName.setBoost(boost); doc.add(searchAbleName); if (sn != null) { diff --git a/src/main/java/au/org/ala/names/search/ALANameSearcher.java b/src/main/java/au/org/ala/names/search/ALANameSearcher.java index dff4a273d..93e616193 100644 --- a/src/main/java/au/org/ala/names/search/ALANameSearcher.java +++ b/src/main/java/au/org/ala/names/search/ALANameSearcher.java @@ -105,7 +105,7 @@ public ALANameSearcher(String indexDirectory) throws IOException { queryParser = new ThreadLocal() { @Override protected QueryParser initialValue() { - QueryParser qp = new QueryParser("genus", new LowerCaseKeywordAnalyzer()); + QueryParser qp = new QueryParser("genus", LowerCaseKeywordAnalyzer.newInstance()); qp.setFuzzyMinSim(0.8f); //fuzzy match similarity setting. used to match the authorship. 
return qp; } @@ -1513,7 +1513,7 @@ else if (rank == RankType.SPECIES && cl.getSpecies() == null) private boolean isHomonymResolvable(LinnaeanRankClassification cl) { TopDocs results = getIRMNGGenus(cl, RankType.GENUS); if (results != null) - return results.totalHits <= 1; + return results.totalHits.value <= 1; return false; } @@ -1562,7 +1562,7 @@ public RankType resolveIRMNGHomonym(LinnaeanRankClassification cl, RankType rank newcl.setSpecies(cl.getSpecies()); if (cl != null && (cl.getGenus() != null || cl.getSpecies() != null)) { TopDocs results = getIRMNGGenus(newcl, rank); - if (results == null || results.totalHits <= 1) + if (results == null || results.totalHits.value <= 1) return null; if (cl != null && cl.getKingdom() != null) { @@ -1570,39 +1570,39 @@ public RankType resolveIRMNGHomonym(LinnaeanRankClassification cl, RankType rank newcl.setKingdom(cl.getKingdom()); //Step 1 search for kingdom and genus results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.KINGDOM; } //Step 2 add the phylum - if (cl.getPhylum() != null && results.totalHits > 1) { + if (cl.getPhylum() != null && results.totalHits.value > 1) { newcl.setPhylum(cl.getPhylum()); results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.PHYLUM; //This may not be a good idea - else if (results.totalHits == 0) + else if (results.totalHits.value == 0) newcl.setPhylum(null);//just in case the phylum was specified incorrectly } //Step 3 try the class if (cl.getKlass() != null) {// && results.totalHits>1){ newcl.setKlass(cl.getKlass()); results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.CLASS; } //step 4 try order - if (cl.getOrder() != null && results.totalHits > 1) { + if (cl.getOrder() != null && results.totalHits.value > 1) { newcl.setOrder(cl.getOrder()); results = getIRMNGGenus(newcl, rank); - if 
(results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.ORDER; } //step 5 try the family - if (cl.getFamily() != null && results.totalHits > 1) { + if (cl.getFamily() != null && results.totalHits.value > 1) { newcl.setFamily(cl.getFamily()); results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.FAMILY; } } @@ -1794,7 +1794,7 @@ public String getPrimaryLsid(String lsid) { TermQuery tq = new TermQuery(new Term("lsid", lsid)); try { org.apache.lucene.search.TopDocs results = idSearcher.search(tq, 1); - if (results.totalHits > 0) + if (results.totalHits.value > 0) return idSearcher.doc(results.scoreDocs[0].doc).get("reallsid"); } catch (IOException e) { } @@ -1808,9 +1808,9 @@ public NameSearchResult searchForRecordByLsid(String lsid) { try { Query query = new TermQuery(new Term(NameIndexField.LSID.toString(), lsid)); TopDocs hits = this.idSearcher.search(query, 1); - if (hits.totalHits == 0) + if (hits.totalHits.value == 0) hits = this.cbSearcher.search(query, 1); - if (hits.totalHits > 0) + if (hits.totalHits.value > 0) return new NameSearchResult(cbSearcher.doc(hits.scoreDocs[0].doc), MatchType.TAXON_ID); } catch (Exception ex) { log.error("Unable to search for record by LSID " + lsid, ex); @@ -2055,7 +2055,7 @@ private String findLSIDByConcatName(String name) { Query query = new TermQuery(new Term("concat_name", concatName)); TopDocs topDocs = cbSearcher.search(query, 2); - if (topDocs != null && topDocs.totalHits == 1) { + if (topDocs != null && topDocs.totalHits.value == 1) { for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document doc = cbSearcher.doc(scoreDoc.doc); return doc.get("guid"); diff --git a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java b/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java index 883464bdb..71ccbd30a 100644 --- a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java +++ 
b/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java @@ -24,6 +24,7 @@ import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.time.DateFormatUtils; import org.apache.log4j.Logger; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.document.*; import org.apache.lucene.index.DirectoryReader; @@ -92,15 +93,15 @@ public class DwcaNameIndexer extends ALANameIndexer { private IndexWriter loadingIndexWriter = null; private IndexWriter vernacularIndexWriter = null; private IndexWriter idWriter = null; - private LowerCaseKeywordAnalyzer analyzer; + private Analyzer analyzer; private Map priorities; - public DwcaNameIndexer(File targetDir, File tmpDir, Properties priorities, boolean loadingIndex, boolean sciIndex) { + public DwcaNameIndexer(File targetDir, File tmpDir, Properties priorities, boolean loadingIndex, boolean sciIndex) throws IOException { this.targetDir = targetDir; this.tmpDir = tmpDir; this.loadingIndex = loadingIndex; this.sciIndex = sciIndex; - this.analyzer = new LowerCaseKeywordAnalyzer(); + this.analyzer = LowerCaseKeywordAnalyzer.newInstance(); this.priorities = this.buildPriorities(priorities); } @@ -311,7 +312,7 @@ private boolean loadCommonNames(File verncacularDwc) throws Exception { lsid = result.getAcceptedLsid() != null ? result.getAcceptedLsid() : result.getLsid(); if (scientificName == null) scientificName = result.getRankClassification().getScientificName(); - Document doc = this.createCommonNameDocument(vernacularName, scientificName, lsid, language,1.0f, false); + Document doc = this.createCommonNameDocument(vernacularName, scientificName, lsid, language, false); this.vernacularIndexWriter.addDocument(doc); } return true; @@ -344,9 +345,9 @@ private void indexCommonNames(File file) throws Exception { String lsid = StringUtils.isNotEmpty(values[1]) ? 
values[1] : values[0]; //check to see if it exists TopDocs result = getLoadIdxResults(null, "lsid", lsid, 1); - if(result.totalHits>0){ + if(result.totalHits.value > 0){ //we can add the common name - Document doc = createCommonNameDocument(values[3], values[2], lsid, values[4], 1.0f, false); + Document doc = createCommonNameDocument(values[3], values[2], lsid, values[4], false); this.vernacularIndexWriter.addDocument(doc); count++; } @@ -379,7 +380,7 @@ private void indexCommonNameExtension(Archive archive) throws Exception { String vernacularName = record.value(DwcTerm.vernacularName); String language = record.value(DcTerm.language); TopDocs result = getLoadIdxResults(null, "lsid", taxonID, 1); - if(result.totalHits > 0){ + if(result.totalHits.value > 0){ Document sciNameDoc = lsearcher.doc(result.scoreDocs[0].doc); //get the scientific name //we can add the common name @@ -388,7 +389,6 @@ private void indexCommonNameExtension(Archive archive) throws Exception { sciNameDoc.get(NameIndexField.NAME.toString()), taxonID, language, - 1.0f, false); this.vernacularIndexWriter.addDocument(doc); count++; @@ -586,7 +586,7 @@ public void generateIndex() throws Exception{ int right = left; int lastRight = right; int count = 0; - while (rootConcepts != null && rootConcepts.totalHits > 0) { + while (rootConcepts != null && rootConcepts.totalHits.value > 0) { ScoreDoc lastConcept = null; for (ScoreDoc sd : rootConcepts.scoreDocs) { lastConcept = sd; @@ -626,7 +626,7 @@ private int addIndex(Document doc, int currentDepth, int currentLeft, LinnaeanRa String id = doc.get(NameIndexField.ID.toString()); //get children for this record TopDocs children = getLoadIdxResults(null, "parent_id", id, PAGE_SIZE); - if(children.totalHits == 0){ + if(children.totalHits.value == 0){ id = doc.get(NameIndexField.LSID.toString()); children = getLoadIdxResults(null, "parent_id", id, PAGE_SIZE); } @@ -754,7 +754,7 @@ protected Document createALASynonymDocument(String scientificName, String author 
String infraspecificEpithet = null; try { TopDocs hits = this.cbSearcher.search(new TermQuery(new Term(NameIndexField.LSID.toString(), acceptedLsid)), 1); - if (hits.totalHits > 0) + if (hits.totalHits.value > 0) accepted = this.cbSearcher.doc(hits.scoreDocs[0].doc); } catch (Exception ex) { log.warn("Error finding accepted document for " + acceptedLsid, ex); diff --git a/src/main/java/au/org/ala/names/util/TermDump.java b/src/main/java/au/org/ala/names/util/TermDump.java index 5be096686..5d405f68c 100644 --- a/src/main/java/au/org/ala/names/util/TermDump.java +++ b/src/main/java/au/org/ala/names/util/TermDump.java @@ -6,6 +6,8 @@ import org.apache.lucene.util.BytesRef; import java.io.*; +import java.util.HashSet; +import java.util.Set; /** * Dump the terms in an index. @@ -33,7 +35,11 @@ public TermDump(File index, Writer output) { public void dump() throws IOException { DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.index.toPath())); - Fields fields = MultiFields.getFields(reader); + Set fields = new HashSet<>(); + for (LeafReaderContext lc: reader.leaves()) { + for (FieldInfo fi: lc.reader().getFieldInfos()) + fields.add(fi.name); + } PrintWriter pw = new PrintWriter(this.output); for (String field: fields) { diff --git a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java index 44f77b492..a4710b16a 100644 --- a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java +++ b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java @@ -20,7 +20,7 @@ public class ALANameSearcherTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214"); + searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); } @Test @@ -1693,4 +1693,64 @@ public void testSynonymMisapplied1() throws Exception { assertTrue(metrics.getErrors().contains(ErrorType.MATCH_MISAPPLIED)); } + 
+ // Higher taxonomy only filled out + @Test + public void testHigherTaxonomy() throws Exception { + String family = "Pterophoridae"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setFamily(family); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("NZOR-6-49519", metrics.getResult().getLsid()); + assertEquals(RankType.FAMILY, metrics.getResult().getRank()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + } + + // Phrase name with rank marker + @Test + public void testPhraseName1() throws Exception { + String name = "Tephrosia sp. Crowded pinnae (C.R.Dunlop 8202)"; + String kingdom = "Planate"; + String phylum = "Streptophyta"; + String class_ = "Equisetopsida"; + String order = "Fabales"; + String genus = "Tephrosia"; + String specificEpithet = "sp. Crowded pinnae (C.R.Dunlop 8202)"; + String rank = "species"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setPhylum(phylum); + cl.setKlass(class_); + cl.setOrder(order); + cl.setGenus(genus); + cl.setSpecificEpithet(specificEpithet); + //cl.setRank(rank); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/instance/apni/932722", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2890778", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + } + + @Test + public void testPhraseName2() throws Exception { + String name = "Tephrosia sp. 
Miriam Vale (E.J.Thompson+ MIR33)"; + String kingdom = "Planate"; + String class_ = "Equisetopsida"; + String genus = "Tephrosia"; + String rank = "species"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setKlass(class_); + cl.setGenus(genus); + //cl.setRank(rank); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/node/apni/2903953", metrics.getResult().getLsid()); + assertEquals(MatchType.PHRASE, metrics.getResult().getMatchType()); + } + } diff --git a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java index 20f522b1c..5885fcf05 100644 --- a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java +++ b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java @@ -22,7 +22,7 @@ public class BiocacheMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214"); + searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); } @Test diff --git a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java b/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java index f0a071cef..b4a77dc74 100644 --- a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java +++ b/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java @@ -30,7 +30,7 @@ public class IconicSpeciesTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214"); + searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); } //@Test diff --git a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java b/src/test/java/au/org/ala/names/search/VernacularMatchTest.java index 3f9c16177..20a34d554 100644 --- 
a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java +++ b/src/test/java/au/org/ala/names/search/VernacularMatchTest.java @@ -24,7 +24,7 @@ public class VernacularMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214"); + searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); } @Test From 4fddc6e05c03fd4de6911c83561ad9bc4890d115 Mon Sep 17 00:00:00 2001 From: pal155 Date: Sun, 18 Jul 2021 13:12:38 +1000 Subject: [PATCH 04/19] Update to the new name matching data. * New index 20210629 * Forbid "Unknown" and "XXX sp." names * Forbid illegitimate-type names from APNI-only source * Get dataset name if available * Build taxonomy from a recursive list of directories * Don't allow forbidden instances when looking for accepted elements (default to scientific name in the expectation that this will be resolved) --- .travis.yml | 6 +- README.md | 4 +- data/ala-taxon-config.json | 71 +++- .../org/ala/names/index/DwcaNameSource.java | 11 +- .../au/org/ala/names/index/NameSource.java | 13 +- .../org/ala/names/index/ScientificName.java | 21 +- .../au/org/ala/names/index/TaxonConcept.java | 2 +- .../ala/names/index/TaxonConceptInstance.java | 27 +- .../java/au/org/ala/names/index/Taxonomy.java | 3 +- .../org/ala/names/index/TaxonomyBuilder.java | 52 ++- .../index/provider/MatchTaxonCondition.java | 2 +- src/main/resources/taxonomy.properties | 3 + .../provider/MatchTaxonConditionTest.java | 17 + .../ala/names/search/ALANameSearcherTest.java | 380 +++++++++++++----- .../ala/names/search/BiocacheMatchTest.java | 16 +- .../ala/names/search/IconicSpeciesTest.java | 2 +- .../ala/names/search/VernacularMatchTest.java | 8 +- .../ala/names/search/iconic_species_list.csv | 10 +- 18 files changed, 500 insertions(+), 148 deletions(-) diff --git a/.travis.yml b/.travis.yml index 57ba34a7f..56e72af80 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,10 +10,10 @@ 
branches: before_install: - mkdir -p ~/.m2; wget -q -O ~/.m2/settings.xml https://raw.githubusercontent.com/AtlasOfLivingAustralia/travis-build-configuration/master/travis_maven_settings_simple.xml -- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20200214-lucene8.tgz https://archives.ala.org.au/archives/nameindexes/20200214/namematching-20200214-lucene8.tgz +- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210621.tgz https://archives.ala.org.au/archives/nameindexes/20200214/namematching-20200214-lucene8.tgz - cd /data/lucene -- sudo tar zxvf namematching-20200214-lucene8.tgz -- sudo ln -s namematching-20200214-lucene8 namematching +- sudo tar zxvf namematching-20210629.tgz +- sudo ln -s namematching-20210629 namematching - ls -laF - cd $TRAVIS_BUILD_DIR diff --git a/README.md b/README.md index 8e0b9cfba..6d0dd4438 100644 --- a/README.md +++ b/README.md @@ -82,8 +82,8 @@ The build creates 3 artefacts in the ala-name-matching/target directory: * ala-name-matching-3.5-distribution.zip - zip containing the project jar and dependencies * ala-name-matching-3.5-sources.jar - source jar for the project code only -The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20200214) and needs to be extracted to the -directory `/data/lucene/namematching-20200214` +The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20220629) and needs to be extracted to the +directory `/data/lucene/namematching-20210629` ## ALA Names List diff --git a/data/ala-taxon-config.json b/data/ala-taxon-config.json index f04775e7e..e776d88c5 100644 --- a/data/ala-taxon-config.json +++ b/data/ala-taxon-config.json @@ -26,6 +26,16 @@ { "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", "taxonomicStatus": "INFERRED_INVALID" + }, + { + "@class": 
"au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "REGEX", + "scientificName": "Unknown( .*|)" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "REGEX", + "scientificName": "[A-Z][A-Za-z]+ sp\\.?" } ], "adjustments": [ @@ -183,13 +193,6 @@ }, "adjustment": -20 }, - { - "condition": { - "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", - "nomenclaturalStatus": "FORGOTTEN" - }, - "adjustment": -20 - }, { "condition": { "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", @@ -743,7 +746,59 @@ "description": "Australian Plant Name Index entries not placed by the Australian Plant Census, given an assumed parent of Plantae", "parent": "apni-apc", "authority": false, - "defaultScore" : 4000 + "defaultScore" : 4000, + "adjuster": { + "forbidden": [ + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "FORGOTTEN" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "CONFUSED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "ABORTED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "SUPERFLUOUS" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "NUDUM" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "NULL_NAME" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "SUPPRESSED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "REJECTED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "REJECTED_OUTRIGHT" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "ILLEGITIMATE" + }, + { + "@class": 
"au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "INVALID" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "DENIED" + } + ] + } }, { "id" : "dr2699", diff --git a/src/main/java/au/org/ala/names/index/DwcaNameSource.java b/src/main/java/au/org/ala/names/index/DwcaNameSource.java index d4473cca0..00e004961 100644 --- a/src/main/java/au/org/ala/names/index/DwcaNameSource.java +++ b/src/main/java/au/org/ala/names/index/DwcaNameSource.java @@ -227,6 +227,12 @@ public void loadVernacularDwCA(Taxonomy taxonomy) throws IndexBuilderException { * @throws IndexBuilderException if unable to load a record into the taxonomy. */ protected void loadTaxonDwCA(Taxonomy taxonomy) throws IndexBuilderException { + String defaultDatasetName = null; + try { + defaultDatasetName = archive.getMetadata().getTitle(); + } catch (MetadataException e) { + taxonomy.report(IssueType.PROBLEM, "provider.archive.noMetadata", (String) null, null); + } if (archive.getCore().getRowType() != DwcTerm.Taxon) throw new IndexBuilderException("Expecting a core row type of " + DwcTerm.Taxon); List classifiers = TaxonConceptInstance.CLASSIFICATION_FIELDS.stream().filter(t -> archive.getCore().hasTerm(t)).collect(Collectors.toList()); @@ -240,7 +246,10 @@ protected void loadTaxonDwCA(Taxonomy taxonomy) throws IndexBuilderException { Record core = record.core(); taxonID = core.value(DwcTerm.taxonID); String verbatimNomenclaturalCode = core.value(DwcTerm.nomenclaturalCode); - NameProvider provider = taxonomy.resolveProvider(core.value(DwcTerm.datasetID), core.value(DwcTerm.datasetName)); + String datasetName = core.value(DwcTerm.datasetName); + if (datasetName == null) + datasetName = defaultDatasetName; + NameProvider provider = taxonomy.resolveProvider(core.value(DwcTerm.datasetID), datasetName); NomenclaturalCode code = taxonomy.resolveCode(verbatimNomenclaturalCode); String scientificName = core.value(DwcTerm.scientificName); 
String scientificNameAuthorship = core.value(DwcTerm.scientificNameAuthorship); diff --git a/src/main/java/au/org/ala/names/index/NameSource.java b/src/main/java/au/org/ala/names/index/NameSource.java index 4b75daad0..926c766d7 100644 --- a/src/main/java/au/org/ala/names/index/NameSource.java +++ b/src/main/java/au/org/ala/names/index/NameSource.java @@ -345,16 +345,15 @@ abstract public class NameSource { * * @throws IndexBuilderException if unable to create the name source */ - public static NameSource create(String f) throws IndexBuilderException { + public static NameSource create(File f) throws IndexBuilderException { try { - File nf = new File(f); NameSource ns; - if (!nf.exists()) - throw new IndexBuilderException("Name source " + nf + " does not exist"); - if (nf.isDirectory()) - ns = new DwcaNameSource(nf); + if (!f.exists()) + throw new IndexBuilderException("Name source " + f + " does not exist"); + if (f.isDirectory()) + ns = new DwcaNameSource(f); else - ns = new CSVNameSource(nf.toPath(), "UTF-8", DwcTerm.Taxon); + ns = new CSVNameSource(f.toPath(), "UTF-8", DwcTerm.Taxon); ns.validate(); return ns; } catch (IOException ex) { diff --git a/src/main/java/au/org/ala/names/index/ScientificName.java b/src/main/java/au/org/ala/names/index/ScientificName.java index 74d8ce9a6..3a8ca1161 100644 --- a/src/main/java/au/org/ala/names/index/ScientificName.java +++ b/src/main/java/au/org/ala/names/index/ScientificName.java @@ -2,6 +2,8 @@ import au.org.ala.names.model.RankType; import au.org.ala.names.util.DwcaWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.*; @@ -30,6 +32,8 @@ * @copyright Copyright (c) 2017 CSIRO */ public class ScientificName extends Name implements Comparable { + private static final Logger logger = LoggerFactory.getLogger(ScientificName.class); + /** * Construct for a container and a key * @@ -100,13 +104,18 @@ public TaxonomicElement findElement(Taxonomy taxonomy, NameProvider 
provider) { */ @Override protected TaxonConcept findPrincipal(Taxonomy taxonomy) { - TaxonConcept principal = this.findBasePrincipal(taxonomy); - TaxonConceptInstance representative = principal.getRepresentative(); - TaxonConceptInstance resolved = representative.getResolvedAccepted(); + try { + TaxonConcept principal = this.findBasePrincipal(taxonomy); + TaxonConceptInstance representative = principal.getRepresentative(); + TaxonConceptInstance resolved = representative.getResolvedAccepted(); - if (resolved != representative && resolved.getContainer().getContainer() == this) - principal = resolved.getContainer(); - return principal; + if (resolved != representative && resolved.getContainer().getContainer() == this) + principal = resolved.getContainer(); + return principal; + } catch (RuntimeException ex) { + logger.error("Unable to find principal for " + this); + throw ex; + } } /** diff --git a/src/main/java/au/org/ala/names/index/TaxonConcept.java b/src/main/java/au/org/ala/names/index/TaxonConcept.java index 832b3eeb2..e3f512fdf 100644 --- a/src/main/java/au/org/ala/names/index/TaxonConcept.java +++ b/src/main/java/au/org/ala/names/index/TaxonConcept.java @@ -118,7 +118,7 @@ public TaxonConceptInstance addInstance(NameKey instanceKey, TaxonConceptInstanc */ public TaxonConceptInstance findInstance(NameProvider provider, boolean acceptedOnly) { for (TaxonConceptInstance instance: this.instances) - if (instance.getProvider().equals(provider) && (!acceptedOnly || instance.isAccepted())) + if (instance.getProvider().equals(provider) && !instance.isForbidden() && (!acceptedOnly || instance.isAccepted())) return instance; return null; } diff --git a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java b/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java index 0bb5b539b..182ed9714 100644 --- a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java +++ b/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java @@ -12,6 +12,8 @@ 
import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.IOException; @@ -29,6 +31,8 @@ * @copyright Copyright © 2017 Atlas of Living Australia */ public class TaxonConceptInstance extends TaxonomicElement { + private static final Logger logger = LoggerFactory.getLogger(TaxonConceptInstance.class); + /** Compare instance base (priovider only) scores */ public static Comparator PROVIDER_SCORE_COMPARATOR = new Comparator() { @Override @@ -935,6 +939,10 @@ private TaxonConceptInstance getResolvedAccepted(TaxonConceptInstance original, if (trace != null) trace.add(ae); TaxonConceptInstance accepted = ae.getRepresentative(); + if (accepted == null) { + logger.warn("Null representative instance for " + ae + " when resolving " + this); + return resolved; + } accepted = accepted.getResolvedAccepted(original, steps - 1, trace, exception); if (!accepted.isForbidden()) return accepted; @@ -993,27 +1001,33 @@ public void normalise() throws IndexBuilderException { * * @param taxonomy The current taxonomy * + * @return True if successfully resolved + * * @throws IndexBuilderException If unable to make a link, usually due to a broken reference */ // If you plan to change this, it is called by a parallel stream, so consisder thread safety // At the moment, this fills out inferred information only - public void resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { + public boolean resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { if (this.parentNameUsageID != null) { this.parent = taxonomy.getInstance(this.parentNameUsageID); } if (this.parentNameUsage != null && this.parent == null) { this.parent = taxonomy.findElement(this.code, this.parentNameUsage, this.provider, null); } - if (this.parent == null && (this.parentNameUsage
!= null || this.parentNameUsageID != null)) - throw new IndexBuilderException("Unable to find parent taxon for " + this + " from " + this.parentNameUsageID + " - " + this.parentNameUsage); + if (this.parent == null && (this.parentNameUsage != null || this.parentNameUsageID != null)) { + taxonomy.report(IssueType.ERROR, "instance.parent.invalidLink", this.taxonID, this.scientificName, "Unable to find parent taxon for " + this + " from " + this.parentNameUsageID + " - " + this.parentNameUsage); + return false; + } if (this.acceptedNameUsageID != null) { this.accepted = taxonomy.getInstance(this.acceptedNameUsageID); } if (this.acceptedNameUsage != null && this.accepted == null) { this.accepted = taxonomy.findElement(this.code, this.acceptedNameUsage, this.provider, null); } - if (this.accepted == null && (this.acceptedNameUsage != null || this.acceptedNameUsageID != null)) - throw new IndexBuilderException("Unable to find accepted taxon for " + this + " from " + this.acceptedNameUsageID + " - " + this.acceptedNameUsage); + if (this.accepted == null && (this.acceptedNameUsage != null || this.acceptedNameUsageID != null)) { + taxonomy.report(IssueType.ERROR, "instance.accepted.invalidLink", this.taxonID, this.scientificName, "Unable to find accepted taxon for " + this + " from " + this.acceptedNameUsageID + " - " + this.acceptedNameUsage); + return false; + } // No parent or accepted taxon but has a classification, so see if we can deduce a parent if (this.parent == null && this.accepted == null && this.classification != null) { String genus = ""; @@ -1045,6 +1059,7 @@ public void resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { if (this.parent == null) this.parent = this.provider.findDefaultParent(taxonomy, this); taxonomy.count("count.resolve.instance.links"); + return true; } /** diff --git a/src/main/java/au/org/ala/names/index/Taxonomy.java b/src/main/java/au/org/ala/names/index/Taxonomy.java index 881da9b75..0580f76ea 100644 ---
a/src/main/java/au/org/ala/names/index/Taxonomy.java +++ b/src/main/java/au/org/ala/names/index/Taxonomy.java @@ -478,7 +478,9 @@ public void provideUnknownTaxon() throws Exception { */ public void resolveLinks() throws IndexBuilderException { logger.info("Resolving links"); - this.instances.values().parallelStream().forEach(instance -> instance.resolveLinks(this)); + // reduce() visits every instance so that all broken links are reported; allMatch() would stop at the first failure + if (!this.instances.values().parallelStream().map(instance -> instance.resolveLinks(this)).reduce(true, Boolean::logicalAnd)) + throw new IndexBuilderException("Errors resolving links"); logger.info("Finished resolving links"); } diff --git a/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java b/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java index 0cf6380bc..d9784b94d 100644 --- a/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java +++ b/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java @@ -8,6 +8,7 @@ import org.slf4j.LoggerFactory; import java.io.*; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -22,6 +23,48 @@ public class TaxonomyBuilder { private static Logger logger = LoggerFactory.getLogger(TaxonomyBuilder.class); + /** + * Recursively find sources. + *

+ * The directory and sub-directory are first searched for a meta.xml file and, + * if present, the source is added as a DwCA. + * Otherwise, any csv files are added to the list and subdirectories recursively + * searched. + *

+ * @param path + * @return + */ + protected static List findSources(File path) { + List sources = new ArrayList<>(); + try { + if (!path.exists()) { + logger.info("Path does not exist " + path); + return sources; + } + if (path.isFile()) { + logger.info("Adding source file at " + path); + sources.add(NameSource.create(path)); + return sources; + } + if (!path.isDirectory()) { + logger.info("Unknown file type for " + path); + } + File meta = new File(path, "meta.xml"); + if (meta.exists()) { + logger.info("Adding DwCA at " + path); + sources.add(NameSource.create(path)); + return sources; + } else { + for (File f : path.listFiles()) { + if (f.isDirectory() || f.getName().endsWith(".csv")) + sources.addAll(findSources(f)); + } + } + } catch (Exception ex) { + logger.error("Unable to get sources for " + path, ex); + } + return sources; + } public static void main(String[] args) { try { @@ -36,12 +79,14 @@ public static void main(String[] args) { Integer samples = null; DwcaNameIndexer indexer; TaxonomyConfiguration config = null; + List sources; Option o = OptionBuilder.withLongOpt("output").withDescription("Output directory - defaults to 'combined' in the current directory").hasArg().withArgName("DIR").withType(File.class).create('o'); Option w = OptionBuilder.withLongOpt("work").withDescription("Working directory - defaults to the current directory").hasArg().withArgName("DIR").withType(File.class).create('w'); Option c = OptionBuilder.withLongOpt("config").withDescription("Configuration file").hasArg().withArgName("FILE").withType(File.class).create('c'); Option r = OptionBuilder.withLongOpt("report").withDescription("Report file").hasArg().withArgName("FILE").withType(File.class).create('r'); Option p = OptionBuilder.withLongOpt("previous").withDescription("Previous taxonomy DwCA").hasArg().withArgName("DIR").withType(File.class).create('p'); + Option recurse = OptionBuilder.withLongOpt("recurse").withDescription("Input file is a directory, recurse through 
subdirectories").create('R'); Option ncl = OptionBuilder.withLongOpt("noclean").withDescription("Don't clean up work area").create(); Option nc = OptionBuilder.withLongOpt("nocreate").withDescription("Don't create an output taxonomy").create(); Option s = OptionBuilder.withLongOpt("sample").withDescription("Output a sample taxonomy, consisting of n concepts plus their parents/accepted").hasArg().withArgName("N").withType(Integer.class).create(); @@ -50,6 +95,7 @@ public static void main(String[] args) { options.addOption(c); options.addOption(r); options.addOption(p); + options.addOption(recurse); options.addOption(ncl); options.addOption(nc); options.addOption(s); @@ -80,7 +126,11 @@ public static void main(String[] args) { if (cmd.hasOption("sample")) { samples = Integer.parseInt(cmd.getOptionValue("sample")); } - List sources = Arrays.asList(cmd.getArgs()).stream().map(f -> NameSource.create(f)).collect(Collectors.toList()); + if (cmd.hasOption("recurse")) { + sources = Arrays.asList(cmd.getArgs()).stream().map(File::new).map(f -> findSources(f)).flatMap(List::stream).collect(Collectors.toList()); + } else { + sources = Arrays.asList(cmd.getArgs()).stream().map(File::new).map(f -> NameSource.create(f)).collect(Collectors.toList()); + } Taxonomy taxonomy = new Taxonomy(config, work); taxonomy.begin(); taxonomy.load(sources); diff --git a/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java b/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java index c9992a01d..52047e6dd 100644 --- a/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java +++ b/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java @@ -211,7 +211,7 @@ private boolean matchScientificName(String name) { return this.matchScientificName.equals(name); case REGEX: if (this.patternScientificName == null) - this.patternScientificName = Pattern.compile(this.scientificName); + this.patternScientificName = Pattern.compile(this.scientificName, 
Pattern.CASE_INSENSITIVE); return this.patternScientificName.matcher(name).matches(); default: if (this.matchScientificName == null) diff --git a/src/main/resources/taxonomy.properties b/src/main/resources/taxonomy.properties index 00d2a60e2..216340814 100644 --- a/src/main/resources/taxonomy.properties +++ b/src/main/resources/taxonomy.properties @@ -38,6 +38,7 @@ count.vernacularName.placed=Placed {0} additional vernacular names count.vernacularName.unplaced=Unable to find taxa for {0} additional vernacular names dwca.additionalInfo=Created by combining source taxonomies using the ala-name-matching algorithms. \ See https://github.com/AtlasOfLivingAustralia/ala-name-matching for more information. +instance.accepted.invalidLink=Invalid accepted link for {0} {1} - {2} instance.accepted.resolve=Unable to resolve accepted taxon for {3} instance.accepted.resolve.loop=Loop resolving accepted taxon for {3} - {2} instance.accepted.resolve.loop.provenance=Synonym loop resolved by converting to inferred unplaced @@ -47,6 +48,7 @@ instance.discarded.synonym.provenance=Discarded name synonymised into this taxon instance.inferredSynonym.provenance=Inferred from {0} in source {1} instance.multiIndex=Multiple index entries for {3}: {4} {5} choosing first instance.noIndex=No index entry for {3} +instance.parent.invalidLink=Invalid parent link for {0} {1} - {2} instance.parent.resolve=Unable to resolve parent for {3} instance.parent.resolve.loop=Loop resolving parent for {3} - {2} instance.parent.resolve.loop.provenance=Parent loop resolved by replacing parent with the unknown taxon @@ -74,6 +76,7 @@ name.UnrankedScientificName.principal=Principal for unranked scientific name {3} name.principal=Principal for {3} is {4} nomenclaturalCode.notFound=Cant find nomenclatural code {2} nomenclaturalStatus.notFound=Cant find nomenclatural status {2}, ignoring - reported once for each status +provider.archive.noMetadata=Archive has no metadata provider.validation.unknownTaxonID.noID=No 
unknown taxon identifier provider.validation.unknownTaxonID.notFound=Unknown taxon identifier {0} not found rank.notFound=Cant find rank of {2}, making unranked - reported once for each rank diff --git a/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java b/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java index c256f22ad..a4e43cdea 100644 --- a/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java +++ b/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java @@ -440,6 +440,23 @@ public void testMatch41() { assertFalse(condition.match(instance, key)); } + + @Test + public void testMatch42() { + MatchTaxonCondition condition = new MatchTaxonCondition(); + condition.setScientificName("Unknown(\\s.*|)"); + condition.setMatchType(NameMatchType.REGEX); + TaxonConceptInstance instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "unknown", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, null, null, null, null, null, null, null); + NameKey key = this.analyser.analyse(instance); + assertTrue(condition.match(instance, key)); + instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "Unknown sp.", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, null, null, null, null, null, null, null); + key = this.analyser.analyse(instance); + assertTrue(condition.match(instance, key)); + instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "Unknownsp.", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, 
null, null, null, null, null, null, null); + key = this.analyser.analyse(instance); + assertFalse(condition.match(instance, key)); + } + @Test public void testWrite1() throws Exception { MatchTaxonCondition condition = new MatchTaxonCondition(); diff --git a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java index a4710b16a..0df2e9d9a 100644 --- a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java +++ b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java @@ -20,7 +20,7 @@ public class ALANameSearcherTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); } @Test @@ -31,7 +31,7 @@ public void testMisappliedNames1() throws Exception { fail("A misapplied exception should be thrown"); //assertEquals("urn:lsid:biodiversity.org.au:apni.taxon:549612",lsid); } catch (MisappliedException ex) { - assertEquals("https://id.biodiversity.org.au/node/apni/2915977", ex.getMatchedResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51401037", ex.getMatchedResult().getLsid()); //assertNull(ex.getMisappliedResult()); } } @@ -53,12 +53,27 @@ public void testMisappliedNames2() { @Test public void testMisappliedNames3() { try { - String name = "Scleroderma aurantium (L. : Pers.) 
Pers."; + String name = "Acacia bivenosa DC."; NameSearchResult nsr = searcher.searchForRecord(name); fail("Expecting misapplied exception"); assertNotNull(nsr); } catch (MisappliedException ex) { - assertEquals("92a4e5c4-32c1-44c6-a9f7-410659692dfa", ex.getMatchedResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2912987", ex.getMatchedResult().getLsid()); + } catch (SearchResultException ex) { + fail("Unexpected search exception " + ex); + } + } + + + @Test + public void testMisappliedNames4() { + try { + String name = "Caladenia concinna"; + NameSearchResult nsr = searcher.searchForRecord(name); + fail("Expecting misapplied exception"); + assertNotNull(nsr); + } catch (MisappliedException ex) { + assertEquals("https://id.biodiversity.org.au/taxon/apni/51398909", ex.getMatchedResult().getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -119,7 +134,7 @@ public void parserBlackList() throws Exception { String name = "Petaurus australis unnamed subsp."; String lsid = searcher.searchForLSID(name, true); assertNotNull(lsid); - assertEquals("ALA_Petaurus_australis_unnamed_subsp", lsid); + assertEquals("ALA_3617757", lsid); } @Test @@ -133,7 +148,7 @@ public void testRecursiveSearch() { try { NameSearchResult nsr = searcher.searchForRecord(cl, true, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:3309bb2e-5b3f-4664-977b-147e60b66109", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3309bb2e-5b3f-4664-977b-147e60b66109", nsr.getLsid()); System.out.println(nsr); } catch (Exception e) { e.printStackTrace(); @@ -150,8 +165,8 @@ public void testSpeciesSplitSynonym() { } catch (Exception e) { assertTrue(e instanceof ParentSynonymChildException); ParentSynonymChildException psce = (ParentSynonymChildException) e; - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:c195483c-6ef0-4043-8bdf-6d9464bef8f9", psce.getParentResult().getLsid()); - 
assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:db338300-a464-4ccb-bdc6-2cf92665fb7d", psce.getChildResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/2c5fd509-d4d6-4adb-9566-96280ff9e6af", psce.getParentResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/b4f39a2b-cfaf-4c69-8ace-77f1664acd6b", psce.getChildResult().getLsid()); } } @@ -176,7 +191,7 @@ public void testExcludedNames() { } catch (Exception e) { assertTrue(e instanceof ExcludedNameException); ExcludedNameException ene = (ExcludedNameException) e; - assertEquals("urn:lsid:biodiversity.org.au:afd.name:433c43fe-cf38-4b76-9bdb-55a89fbac291", ene.getExcludedName().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/74ac7082-6138-4eb0-86ba-95535deab180", ene.getExcludedName().getLsid()); } String apcExcludedName = "Parestia elegans"; @@ -205,7 +220,7 @@ public void testHomonymsWithResolution1() throws Exception { cl.setScientificName("Thalia"); try { nsr = searcher.searchForRecord("Thalia", null, true); - fail("Thalia should throw a homonym without kingdom or author"); + fail("Thalia should throw a homonym without kingdom or author, got " + nsr.getLsid()); } catch (HomonymException e) { } } @@ -220,7 +235,7 @@ public void testHomonymsWithResolution2() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Homonym should be resolved via the Kingdom"); } @@ -250,7 +265,7 @@ public void testHomonymsWithResolution4() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + 
assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Author should identify homonym value to use"); } @@ -265,7 +280,7 @@ public void testHomonymsWithResolution5() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Author should identify homonym value to use"); } @@ -294,7 +309,7 @@ public void testsStrMarker1(){ cl.setScientificName("Macropus rufus"); nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:fbe09d8b-8cc2-444a-b8f7-d06730543781", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/fbe09d8b-8cc2-444a-b8f7-d06730543781", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -311,7 +326,7 @@ public void testsStrMarker2(){ cl.setScientificName("Osphranter rufus"); nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -324,7 +339,7 @@ public void testsStrMarker3() { String name = "Oenochrominae s. 
str."; // There's only one of these left NameSearchResult nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:537ff8fb-b6c2-4536-9cb8-ad244832c1de", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/537ff8fb-b6c2-4536-9cb8-ad244832c1de", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -349,7 +364,7 @@ public void testsStrMarker5() { String name = "Stennella longirostris longirostris"; NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("ALA_Stennella_longirostris_longirostris", nsr.getLsid()); + assertEquals("ALA_190693", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -361,7 +376,7 @@ public void testsStrMarker6() { String name = "Aplonis fusca hulliana"; NameSearchResult nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:d1674a33-af14-4592-be4d-2ededc1b53cd", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/7b241ea8-07ab-4aa0-a2d7-c0b43767c3d4", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -398,8 +413,8 @@ public void testsStrMarker9() { String name = "Siganus nebulosus"; NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:c2d406d8-1066-4fd3-8c95-31ee6343a1b8", nsr.getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:0aa9653f-00c7-42b9-896b-f399103703b8", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c2d406d8-1066-4fd3-8c95-31ee6343a1b8", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/0aa9653f-00c7-42b9-896b-f399103703b8", nsr.getAcceptedLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); @@ -411,7 +426,7 @@ 
public void testsStrMarker10() { String name = "Anabathron contabulatum"; NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:eea54328-a4a5-406b-bdfd-3ed119241591", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/39a6129d-dca7-4e3f-bec7-88f0e848c92c", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -443,7 +458,7 @@ public void testQuestionSpeciesMatch() { //test the "name based" synonym "has generic combination" nsr = searcher.searchForRecord("Cacatua leadbeateri", null); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:0217f06f-664c-4c64-bc59-1b54650fa23d", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5815e99d-01cd-4a92-99ba-36f480c4834d", nsr.getAcceptedLsid()); name = "Zieria smithii"; nsr = searcher.searchForRecord(name, null); @@ -465,7 +480,7 @@ public void testSpMarker1() { nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); // Either one can match - assertTrue("https://id.biodiversity.org.au/name/apni/190511".equals(nsr.getLsid()) || "https://id.biodiversity.org.au/name/apni/233691".equals(nsr.getLsid())); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51414212", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -583,7 +598,7 @@ public void testPhraseMatch4() { NameSearchResult nsr = null; nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/name/apni/233691", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51414212", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -596,29 +611,41 @@ public void testPhraseMatch5() { NameSearchResult nsr = null; nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - 
assertEquals("https://id.biodiversity.org.au/node/apni/2898916", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/instance/apni/9302042", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } } - @Test - public void testSynonymWithoutRank() { - try { - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setKingdom("Animalia"); - cl.setScientificName("Gymnorhina tibicen"); - NameSearchResult nsr = searcher.searchForRecord(cl, true, true); - assertEquals("Gymnorhina tibicen", nsr.getRankClassification().getScientificName()); - assertEquals("(Latham, 1801)", nsr.getRankClassification().getAuthorship()); - nsr = searcher.searchForRecord("Cracticus tibicen", RankType.SPECIES); - assertEquals("Cracticus tibicen", nsr.getRankClassification().getScientificName()); - nsr = searcher.searchForRecord("Cracticus tibicen", RankType.GENUS); - assertEquals(null, nsr); - } catch (Exception e) { + public void testSynonymWithoutRank1() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom("Animalia"); + cl.setScientificName("Gymnorhina tibicen"); + NameSearchResult nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Gymnorhina tibicen", nsr.getRankClassification().getScientificName()); + assertEquals("(Latham, 1801)", nsr.getRankClassification().getAuthorship()); + cl.setScientificName("Cracticus tibicen"); + cl.setRank(RankType.SPECIES.getRank()); + nsr = searcher.searchForRecord(cl, true, true); + assertEquals("ALA_3267030", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5291343e-fdeb-4a65-8ba5-928f5b96acf5", nsr.getAcceptedLsid()); + nsr = searcher.searchForRecord("Cracticus tibicen", RankType.GENUS); + assertEquals(null, nsr); + } - } + + @Test + public void testSynonymWithoutRank2() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Abantiades zonatriticum"); + 
NameSearchResult nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Abantiades", nsr.getRankClassification().getScientificName()); + assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); + cl.setRank(RankType.SPECIES.getRank()); + nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Abantiades", nsr.getRankClassification().getScientificName()); + assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); } @Test @@ -637,11 +664,11 @@ public void testRecordSearchWithoutScientificName() { @Test public void testInfragenricAndSoundEx1() { - String nameDifferentEnding = "Phylidonyris pyrrhopterus"; + String nameDifferentEnding = "Phylidonyris pyrrhoptera"; try { NameSearchResult nsr = searcher.searchForRecord(nameDifferentEnding, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -654,8 +681,8 @@ public void testInfragenricAndSoundEx2() { try { NameSearchResult nsr = searcher.searchForRecord(nameWithInfraGenric, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); - assertEquals(MatchType.EXACT, nsr.getMatchType()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); + assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -663,11 +690,11 @@ public void testInfragenricAndSoundEx2() { @Test public void testInfragenricAndSoundEx3() { - String nameDiffEndInfraGeneric = "Phylidonyris (Phylidonyris) pyrrhopterus"; + String nameDiffEndInfraGeneric = "Phylidonyris 
(Phylidonyris) pyrrhopteras"; try { NameSearchResult nsr = searcher.searchForRecord(nameDiffEndInfraGeneric, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -680,7 +707,7 @@ public void testInfragenricAndSoundEx4() { try { NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1ba5449-a68e-4c3b-ae90-8e667617945b", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c7d8dbc8-dcde-4182-85ba-907182f95ea9", nsr.getLsid()); assertEquals(MatchType.EXACT, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -693,7 +720,7 @@ public void testInfragenricAndSoundEx5() { try { NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1ba5449-a68e-4c3b-ae90-8e667617945b", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c7d8dbc8-dcde-4182-85ba-907182f95ea9", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -706,8 +733,8 @@ public void testInfragenricAndSoundEx6() { try { NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("CAAB:79629da1:6054320e:589caaa6:bb265593", nsr.getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:1a85a82f-5a1f-4c56-9f04-918643461260", nsr.getAcceptedLsid()); + assertEquals("SY_39006017_1", nsr.getLsid()); + 
assertEquals("https://biodiversity.org.au/afd/taxa/1a85a82f-5a1f-4c56-9f04-918643461260", nsr.getAcceptedLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -780,7 +807,7 @@ public void testOutOfGeography1() { try { NameSearchResult nsr = searcher.searchForRecord(classification, true, true, true); assertNotNull(nsr); - assertEquals("ALA_Proboscidea", nsr.getLsid()); + assertEquals("ALA_3267031", nsr.getLsid()); assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); assertEquals(RankType.ORDER, nsr.getRank()); } catch (SearchResultException ex) { @@ -797,7 +824,7 @@ public void testOutOfGeography2() { try { NameSearchResult nsr = searcher.searchForRecord(classification, true, true, true); assertNotNull(nsr); - assertEquals("ALA_Myrina", nsr.getLsid()); + assertEquals("ALA_3267033", nsr.getLsid()); assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); assertEquals(RankType.GENUS, nsr.getRank()); } catch (SearchResultException ex) { @@ -856,7 +883,7 @@ public void testPhraseNames() { public void testNoRank() { try { String lsid = searcher.searchForLSID("Animalia"); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:4647863b-760d-4b59-aaa1-502c8cdf8d3c", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/4647863b-760d-4b59-aaa1-502c8cdf8d3c", lsid); lsid = searcher.searchForLSID("Bacteria"); assertEquals("NZOR-6-73174", lsid); } catch (SearchResultException e) { @@ -911,7 +938,7 @@ public void testIgnoredHomonyms2() { cl.setGenus("Macropus"); //NameSearchResult nsr =searcher.searchForRecord(cl.getId(), cl, null, true,true); String lsid = searcher.searchForLSID("Macropus", false, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1d9bf29-648f-47e6-8544-2c2fbdf632b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/b1d9bf29-648f-47e6-8544-2c2fbdf632b1", lsid); } catch (Exception e) { fail("ignored homonyms should not throw 
exception " + e.getMessage()); } @@ -940,7 +967,7 @@ public void testIgnoredHomonyms4() { cl.setGenus("Agathis"); cl.setKingdom("Animalia"); NameSearchResult nsr = searcher.searchForRecord(cl.getScientificName(), cl, null, true, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:a4109d9e-723c-491a-9363-95df428fe230", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/d02923bc-cf54-4d7f-ae74-aac1d6af1830", nsr.getLsid()); } catch (Exception e) { fail("A kingdom was supplied and should be resolvable. " + e.getMessage()); } @@ -1022,7 +1049,7 @@ public void testCommonNames1() { String name = "Red Kangaroo"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", lsid); assertEquals("Osphranter rufus", sciName); } @@ -1031,8 +1058,8 @@ public void testCommonNames2() { String name = "Yellow-tailed Black-Cockatoo"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:72ca8d75-71da-4751-a5cf-aa07ac3869f7", lsid); - assertEquals("Calyptorhynchus (Zanda) funereus", sciName); + assertEquals("https://biodiversity.org.au/afd/taxa/145b081d-eca7-4d9b-9171-b97e2d061536", lsid); + assertEquals("Zanda funerea", sciName); } @Test @@ -1040,7 +1067,7 @@ public void testCommonNames3() { String name = "Scarlet Robin"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b02a8195-266e-463b-89b7-3dc2a1c48450", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/a3e5376b-f9e6-4bdf-adae-1e7add9f5c29", lsid); assertEquals("Petroica (Petroica) boodang", sciName); } @@ -1049,7 +1076,7 @@ public void testCommonNames4() { String name = "Pacific Bluefin Tuna"; String lsid = 
getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b35bf6d6-3b67-4d4c-b81e-b7ca7a64d341", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/b35bf6d6-3b67-4d4c-b81e-b7ca7a64d341", lsid); assertEquals("Thunnus orientalis", sciName); } @@ -1058,7 +1085,7 @@ public void testCommonNames5() { String name = "Pacific Black Duck"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:da8a156f-95e2-4fcb-a6e7-52721705a70c", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/81be58f5-caf7-4f3d-b1eb-d4f83eb0af5a", lsid); assertEquals("Anas (Anas) superciliosa", sciName); } @@ -1067,7 +1094,7 @@ public void testCommonNames6() { String name = "European Carp"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:16171fac-8d6c-4327-9fab-f2db864d71bf", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/16171fac-8d6c-4327-9fab-f2db864d71bf", lsid); assertEquals("Cyprinus carpio", sciName); } @@ -1075,13 +1102,13 @@ public void testCommonNames6() { public void testCommonNames7() { String name = "Sulphur-crested Cockatoo"; String lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); name = "Sulphur crested Cockatoo"; lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); name = "SULPHUR CRESTED COCKATOO"; lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + 
assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); String sciName = getCommonName(name); assertEquals("Cacatua (Cacatua) galerita", sciName); } @@ -1159,7 +1186,7 @@ public void testMyrmecia() { public void testSearchForLSID1() { try { String lsid = searcher.searchForLSID("Anochetus"); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1169,7 +1196,7 @@ public void testSearchForLSID1() { public void testSearchForLSID2() { try { String lsid = searcher.searchForLSID("Anochetus", true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1179,7 +1206,7 @@ public void testSearchForLSID2() { public void testSearchForLSID3() { try { String lsid = searcher.searchForLSID("Anochetus", true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1189,7 +1216,7 @@ public void testSearchForLSID3() { public void testSearchForLSID4() { try { String lsid = searcher.searchForLSID("Anochetus", RankType.GENUS); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1200,7 
+1227,7 @@ public void testSearchForLSID5() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID("Anochetus", cl, RankType.GENUS); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1211,7 +1238,7 @@ public void testSearchForLSID6() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1222,7 +1249,7 @@ public void testSearchForLSID7() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1230,11 +1257,11 @@ public void testSearchForLSID7() { @Test public void testFuzzyMatches() throws Exception { - //Eolophus roseicapillus - non fuzzy match - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:577ff059-a2a7-48b0-976c-fdd6a345f878", searcher.searchForLSID("Eolophus roseicapilla")); + //Eolophus 
roseicapilla - non fuzzy match + assertEquals("https://biodiversity.org.au/afd/taxa/9b4ad548-8bb3-486a-ab0a-905506c463ea", searcher.searchForLSID("Eolophus roseicapilla")); - //Eolophus roseicapilla - fuzzy match - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:577ff059-a2a7-48b0-976c-fdd6a345f878", searcher.searchForLSID("Eolophus roseicapillus", true)); + //Eolophus roseicapillus - fuzzy match + assertEquals("https://biodiversity.org.au/afd/taxa/9b4ad548-8bb3-486a-ab0a-905506c463ea", searcher.searchForLSID("Eolophus roseicapillus", true)); } @Test @@ -1308,10 +1335,10 @@ public void testRankMarker() { @Test public void testSimpleLookup1() { try { - String name = "Megalurus gramineus"; + String name = "Poodytes gramineus"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b88430ed-f7d7-482e-a586-f0a02d8e11ce", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/061fef09-7c9d-4b6d-9827-4da13a350dc6", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1324,7 +1351,7 @@ public void testSimpleLookup2() { String name = "Synemon plana"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:a51dca29-50e7-49b4-ae35-5c35a9c4f854", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/a51dca29-50e7-49b4-ae35-5c35a9c4f854", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1349,7 +1376,7 @@ public void testSimpleLookup4() { String name = "Chenopodium x bontei nothovar. 
submelanocarpum"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/instance/apni/769095", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2902250", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1363,8 +1390,8 @@ public void testSimpleLookup5() { cl.setScientificName("Favolus princeps"); NameSearchResult nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); // Been removed - assertEquals("43e1bc65-3580-47db-b269-cdb066ed49e9", nsr.getLsid()); - assertEquals( "10911fd1-a2dd-41f1-9c4d-8dff7f118670", nsr.getAcceptedLsid()); + assertEquals("https://id.biodiversity.org.au/instance/fungi/60071845", nsr.getLsid()); + assertEquals( "https://id.biodiversity.org.au/node/fungi/60098663", nsr.getAcceptedLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1393,7 +1420,7 @@ public void testSimpleLookup7() { String name = "Astomum"; NameSearchResult nsr = searcher.searchForRecord(name, cl, RankType.GENUS); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/name/ausmoss/10001613", nsr.getLsid()); + assertEquals("NZOR-6-29460", nsr.getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1404,7 +1431,7 @@ public void testSimpleLookup8() { try { String name = "Carbo ater"; NameSearchResult nsr = searcher.searchForRecord(name); - fail("Expecting ecxluded name exception"); + fail("Expecting excluded name exception"); } catch (ExcludedNameException ex) { assertNull(ex.getNonExcludedName()); // Two types both excluded } catch (SearchResultException ex) { @@ -1418,7 +1445,7 @@ public void testSimpleLookup9() { String name = "Neobatrachus sudellae"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:953a5af4-2932-4c8b-8f33-850b5f8f3fed", nsr.getLsid()); + 
assertEquals("https://biodiversity.org.au/afd/taxa/953a5af4-2932-4c8b-8f33-850b5f8f3fed", nsr.getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1449,6 +1476,76 @@ public void testSimpleLookup12() { fail("Unexpected search exception " + e); } } + + // Do not match nom illeg. taxonomic status + @Test + public void testSimpleLookup13() throws Exception { + String name = "Banksia collina"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/instance/apni/838699", nsr.getLsid()); + } + + @Test + public void testSimpleLookup14() throws Exception { + String name = "Stephanopis similis"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://biodiversity.org.au/afd/taxa/24bc164a-85b2-4633-85c5-a3b399daec0a", nsr.getLsid()); + } + + @Test + public void testSimpleLookup15() throws Exception { + String name = "Fraus latistria"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://biodiversity.org.au/afd/taxa/2358fcc0-8db2-475d-8da4-fd4bd5e711f2", nsr.getLsid()); + } + + @Test + public void testSimpleLookup16() throws Exception { + String name = "Metrosideros fulgens"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/name/apni/110385", nsr.getLsid()); + } + + + @Test + public void testSimpleLookup17() throws Exception { + String name = "Metrosideros scandens"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/name/apni/233086", nsr.getLsid()); + } + + + @Test + public void testMetricsLookup1() throws Exception { + String name = "Geopelia placida"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, 
true, true); + assertNotNull(metrics); + assertEquals("https://biodiversity.org.au/afd/taxa/3d5c4e0d-5138-46e0-8e14-5acd8fd2c523", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); + } + + + @Test + public void testMetricsLookup2() throws Exception { + String name = "Trigonaphera vinnulum"; // Synonym of Trigonostoma vinnulum + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true, true); + assertNotNull(metrics); + assertEquals("https://biodiversity.org.au/afd/taxa/7e67e588-927e-48a9-8765-365ae9f25fcb", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5855a347-eee2-47bb-8130-94d49602d232", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); + } + @Test public void testParentChildSynonym1() { try { @@ -1458,7 +1555,7 @@ public void testParentChildSynonym1() { } catch (ParentSynonymChildException ex) { NameSearchResult nsr = ex.getChildResult(); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:d0e66526-1cdd-4b03-85b2-71b7e7d8b84a", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3e062650-6ecb-43e7-a903-5487e3dbbbb5", nsr.getLsid()); assertEquals(RankType.SUBSPECIES, nsr.getRank()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); @@ -1474,7 +1571,7 @@ public void testParentChildSynonym2() { } catch (ParentSynonymChildException ex) { NameSearchResult nsr = ex.getChildResult(); assertNotNull(nsr); - assertEquals("8e64942a-f300-46c8-ba97-76492d25d985", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/node/fungi/60083449", nsr.getLsid()); assertEquals(RankType.FORM, 
nsr.getRank()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); @@ -1490,8 +1587,8 @@ public void testStigmoderaAurifera() { cl.setScientificName(name); NameSearchResult nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:e89de580-2942-479d-b5ef-5edd60424560", nsr.getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2e8ac1d8-5f2b-4fcd-a124-c619c7cab6b0", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/426ab801-0d5f-4b43-b1b4-55ce7ce7a44e", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/6c212123-fadc-4307-8dd8-ac501bb534ba", nsr.getAcceptedLsid()); assertEquals("Stigmodera aurifera", nsr.getRankClassification().getScientificName()); assertEquals(MatchType.CANONICAL, nsr.getMatchType()); } catch (SearchResultException e) { @@ -1568,7 +1665,7 @@ public void testHigherTaxonMatch2() { cl.setScientificName(name); NameSearchResult nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("NZOR-6-1843", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/node/lichen/30088140", nsr.getLsid()); assertEquals("Ramalina", nsr.getRankClassification().getGenus()); assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); } catch (SearchResultException e) { @@ -1639,6 +1736,26 @@ public void testMultipleMisappliedResolution3() throws Exception { assertTrue(metrics.getErrors().contains(ErrorType.MISAPPLIED)); } + // Ensure misapplication is ignored + @Test + public void testMultipleMisappliedResolution4() throws Exception { + String name = "Pterostylis bryophila"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51412050", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, 
metrics.getResult().getMatchType()); + name = "Pterostylis obtusa"; + cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51412242", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.MATCH_MISAPPLIED)); + } + // Synonym and accepted @Test @@ -1678,6 +1795,20 @@ public void testSynonymAccepted3() throws Exception { assertEquals("https://id.biodiversity.org.au/node/apni/2911212", metrics.getResult().getAcceptedLsid()); } + + @Test + public void testSynonymAccepted4() throws Exception { + String name = "Sugomel niger"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("ALA_3782348", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.NONE)); + assertEquals("https://biodiversity.org.au/afd/taxa/b32a2ec6-315c-48cf-84b3-4898e39f4b57", metrics.getResult().getAcceptedLsid()); + } + // Available as a synonym but also misapplied. @Test public void testSynonymMisapplied1() throws Exception { @@ -1711,7 +1842,7 @@ public void testHigherTaxonomy() throws Exception { @Test public void testPhraseName1() throws Exception { String name = "Tephrosia sp. 
Crowded pinnae (C.R.Dunlop 8202)"; - String kingdom = "Planate"; + String kingdom = "Plantae"; String phylum = "Streptophyta"; String class_ = "Equisetopsida"; String order = "Fabales"; @@ -1725,7 +1856,7 @@ public void testPhraseName1() throws Exception { cl.setOrder(order); cl.setGenus(genus); cl.setSpecificEpithet(specificEpithet); - //cl.setRank(rank); + cl.setRank(rank); cl.setScientificName(name); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); assertNotNull(metrics); @@ -1736,8 +1867,8 @@ public void testPhraseName1() throws Exception { @Test public void testPhraseName2() throws Exception { - String name = "Tephrosia sp. Miriam Vale (E.J.Thompson+ MIR33)"; - String kingdom = "Planate"; + String name = "Tephrosia sp. (Miriam Vale E.J.Thompson+ MIR33)"; + String kingdom = "Plantae"; String class_ = "Equisetopsida"; String genus = "Tephrosia"; String rank = "species"; @@ -1749,8 +1880,69 @@ public void testPhraseName2() throws Exception { cl.setScientificName(name); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/instance/apni/51376249", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2903953", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertEquals(SynonymType.SUBJECTIVE_SYNONYM, metrics.getResult().getSynonymType()); + } + + @Test + public void testPhraseName3() throws Exception { + String name = "Thryptomene sp. Leinster (B.J. Lepschi & L.A. 
Craven 4362) PN"; + String kingdom = "Plantae"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/node/apni/2904210", metrics.getResult().getLsid()); + assertEquals(MatchType.PHRASE, metrics.getResult().getMatchType()); + } + + @Test + public void testPhraseName4() throws Exception { + String name = "Tephrosia sp. Miriam Vale (E.J.Thompson+ MIR33) WA Herbarium"; + String kingdom = "Plantae"; + String class_ = "Equisetopsida"; + String genus = "Tephrosia"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setKlass(class_); + cl.setGenus(genus); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); assertEquals("https://id.biodiversity.org.au/node/apni/2903953", metrics.getResult().getLsid()); assertEquals(MatchType.PHRASE, metrics.getResult().getMatchType()); } + // Ensure illegitimate names are excluded from the system and don't gum the works up + @Test + public void testIllegitimate1() throws Exception { + String name = "Banksia collina"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/instance/apni/838699", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2900678", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertEquals(SynonymType.OBJECTIVE_SYNONYM, metrics.getResult().getSynonymType()); + } + + + // Ensure illegitimate names are excluded from the system and don't gum the works up + @Test + public void testIllegitimate2() throws 
Exception { + String name = "Zieria fordii"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/name/apni/51337126", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51367864", metrics.getResult().getRankClassification().getGid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51367862", metrics.getResult().getRankClassification().getFid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + } + } diff --git a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java index 5885fcf05..ed7805dd6 100644 --- a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java +++ b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java @@ -22,7 +22,7 @@ public class BiocacheMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); } @Test @@ -82,7 +82,7 @@ public void testRecursiveAuthorshipIssue() { cl.setGenus("Graphis"); cl.setSpecificEpithet("notreallyaname"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2af76a1e-2086-46e3-90b9-6f00983b21a5", metrics.getResult().getLsid()); // Graphis from AFD + assertEquals("https://biodiversity.org.au/afd/taxa/2af76a1e-2086-46e3-90b9-6f00983b21a5", metrics.getResult().getLsid()); // Graphis from AFD } catch (Exception e) { e.printStackTrace(); fail("Exception should not occur"); @@ -98,7 +98,7 @@ public void testRecursiveAuthorshipIssue2() throws Exception { cl.setGenus("Graphis"); cl.setSpecificEpithet("notreallyaname"); MetricsResultDTO metrics = 
searcher.searchForRecordMetrics(cl, true); - assertEquals("NZOR-6-122770", metrics.getResult().getLsid()); // Can't find Graphis since not APC placed so gets Graphidaceae + assertEquals("https://id.biodiversity.org.au/taxon/lichen/30241431", metrics.getResult().getLsid()); // Can't find Graphis since not APC placed so gets Graphidaceae } @Test @@ -236,7 +236,7 @@ public void testParentChildWithDifferentSpelling1() throws Exception { cl.setScientificName("Climacteris affinis"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:5d7c50bc-2c2d-4984-9924-d2a46dc3b00f", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/0d28bce2-0bae-44f6-9c73-0afc0f343b8c", metrics.getResult().getLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); // Dereferenced synonym assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); } @@ -247,7 +247,7 @@ public void testParentChildWithDifferentSpelling2() throws Exception { cl.setScientificName("Limnodynastes dumerilii"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2c50c2f6-7a0d-44e1-b549-458427b420c4", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/2c50c2f6-7a0d-44e1-b549-458427b420c4", metrics.getResult().getLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); // Dereferenced synonym assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); } @@ -259,7 +259,7 @@ public void testAffCfSpecies1() throws Exception { // No issues cl.setScientificName("Zabidius novemaculeatus"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:58e06bba-de3b-4c8c-b165-d75bbeb21a36", metrics.getResult().getLsid()); + 
assertEquals("https://biodiversity.org.au/afd/taxa/58e06bba-de3b-4c8c-b165-d75bbeb21a36", metrics.getResult().getLsid()); assertTrue(metrics.getErrors().contains(ErrorType.NONE)); cl = new LinnaeanRankClassification(); @@ -453,8 +453,8 @@ public void testDingo1() { cl.setScientificName(name); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); assertNotNull(metrics); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:3064f20b-f6de-4375-8377-904cbd6cf9fa", metrics.getResult().getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:c2056f1b-fcde-45b9-904b-1cab280368d1", metrics.getResult().getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3064f20b-f6de-4375-8377-904cbd6cf9fa", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c2056f1b-fcde-45b9-904b-1cab280368d1", metrics.getResult().getAcceptedLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); diff --git a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java b/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java index b4a77dc74..cd2c324e0 100644 --- a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java +++ b/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java @@ -30,7 +30,7 @@ public class IconicSpeciesTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); } //@Test diff --git a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java b/src/test/java/au/org/ala/names/search/VernacularMatchTest.java index 20a34d554..62ac8c484 100644 --- a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java +++ b/src/test/java/au/org/ala/names/search/VernacularMatchTest.java @@ -24,13 +24,13 @@ public class 
VernacularMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); } @Test public void testVernacular1() throws Exception { String name = "Mary River Turtle"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:d315deea-822c-4f2c-b439-da33d6af5fd6"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/d315deea-822c-4f2c-b439-da33d6af5fd6"; NameSearchResult result = null; result = searcher.searchForCommonName(name); @@ -52,7 +52,7 @@ public void testVernacular2() throws Exception { @Test public void testVernacular3() throws Exception { String name = "Drain Mangrovegoby"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:19c60dcd-93a0-40a2-9ac1-3abe7119c505"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/19c60dcd-93a0-40a2-9ac1-3abe7119c505"; NameSearchResult result = null; result = searcher.searchForCommonName(name); @@ -64,7 +64,7 @@ public void testVernacular3() throws Exception { @Test public void testVernacular4() throws Exception { String name = "Onespine Unicornfish"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:f7bfd383-5501-4196-9acb-d9d4d03cc45d"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/f7bfd383-5501-4196-9acb-d9d4d03cc45d"; NameSearchResult result = null; result = searcher.searchForCommonName(name); diff --git a/src/test/resources/au/org/ala/names/search/iconic_species_list.csv b/src/test/resources/au/org/ala/names/search/iconic_species_list.csv index d96313247..568e24879 100644 --- a/src/test/resources/au/org/ala/names/search/iconic_species_list.csv +++ b/src/test/resources/au/org/ala/names/search/iconic_species_list.csv @@ -23,12 +23,12 @@ BIRDS ,Boobook Owl,,Animalia,Chordata,Aves,STRIGIFORMES,STRIGIDAE,Ninox,novaeseelandiae,,,Yes,Yes,Yes ,Little 
Raven,,Animalia,Chordata,Aves,PASSERIFORMES,CORVIDAE,Corvus,mellori,,,Yes,Yes,Yes ,Sulphur-crested Cockatoo,,Animalia,Chordata,Aves,PSITTACIFORMES,CACATUIDAE,Cacatua,galerita,,,Yes,Yes,Yes -,Osprey,,Animalia,Chordata,Aves,FALCONIFORMES,ACCIPITRIDAE,Pandion,haliaetus,,,Yes,Yes,Yes +,Osprey,,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Pandion,haliaetus,,,Yes,Yes,Yes ,Major Mitchell Cockatoo,,Animalia,Chordata,Aves,PSITTACIFORMES,CACATUIDAE,Lophochroa,leadbeateri,,,Yes,Yes,Yes ,Southern Cassowary,,Animalia,Chordata,Aves,STRUTHIONIFORMES,CASUARIIDAE,Casuarius,casuarius,,,Yes,No,No ,Cape Baron Goose,,Animalia,Chordata,Aves,ANSERIFORMES,ANATIDAE,Cereopsis,novaehollandiae,novaehollandiae,,Yes,Yes,Yes ,Brolga,,Animalia,Chordata,Aves,GRUIFORMES,GRUIDAE,Grus,rubicunda,,,Yes,No,No -,Wedge-tailed Eagle,,Animalia,Chordata,Aves,FALCONIFORMES,ACCIPITRIDAE,Aquila,audax,,,Yes,No,Yes +,Wedge-tailed Eagle,,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Aquila,audax,,,Yes,No,Yes FISH ,Barramundi,,Animalia,CHORDATA,ACTINOPTERYGII,PERCIFORMES,LATIDAE,Lates,calcarifer,,,yes,yes,yes @@ -116,7 +116,7 @@ REPTILES INVERTEBRATES ,Red-backed Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,THERIDIIDAE,Latrodectus,hasseltii,,,yes,yes,yes -,Sydney Funnelweb Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,HEXATHELIDAE,Atrax,robustus,,,yes,yes,yes +,Sydney Funnelweb Spider,,Animalia,Arthropoda,Arachnida,Araneae,Atracidae,Atrax,robustus,,,yes,yes,yes ,Red-headed Mouse Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,ACTINOPODIDAE,Missulena,occatoria,,,yes,yes,yes ,Cairn's Birdwing,,Animalia,Arthropoda,Insecta,LEPIDOPTERA,PAPILIONIDAE,Ornithoptera,priamus,,,yes,yes,yes ,Cabbage White Butterfly,,Animalia,Arthropoda,Insecta,LEPIDOPTERA,PIERIDAE,Pieris,rapae,,,yes,yes,yes @@ -196,12 +196,12 @@ Marine,Blue Groper,Official,Animalia,,,,,Achoerodus,viridis NT Animal,Red Kangaroo,Official,Animalia,CHORDATA,MAMMALIA,DIPROTODONTIA,MACROPODIDAE,Osphranter,rufus,,,yes,yes - limited,yes Plant,Sturt's Desert 
Rose,official,Plantae,Charophyta,Equisetopsida,MALVALES,MALVACEAE,Gossypium,sturtianum,,,yes,yes - limited,yes -Bird,Wedge-tailed,Official,Animalia,CHORDATA,AVES,FALCONIFORMES,ACCIPITRIDAE,Aquila,audax,,Wedge-tailed Eagle,yes,yes - limited,yes - change image on region page +Bird,Wedge-tailed,Official,Animalia,CHORDATA,AVES,Accipitriformes,Accipitridae,Aquila,audax,,Wedge-tailed Eagle,yes,yes - limited,yes - change image on region page Marine,No emblem QLD Animal,Koala,Official,ANIMALIA,CHORDATA,MAMMALIA,DIPROTODONTIA,PHASCOLARCTIDAE,Phascolarctos,cinereus,,,yes,yes,yes - change image on Region Page -Plant,,Official,,,,,,Vappodes,phalaenopsis,,Cooktown Orchid,yes,yes - limited,yes - limited +Plant,,Official,Plantae,Charophyta,Equisetopsida,Asparagales,Orchidaceae,Dendrobium,bigibbum ,,Cooktown Orchid,yes,yes - limited,yes - limited Bird,Brolga,Official,ANIMALIA,CHORDATA,AVES,GRUIFORMES,GRUIDAE,Grus,rubicunda,,,yes,yes,yes Marine,Anemone Fish,Official,ANIMALIA,,,,,Amphiprion,akindynos From cc9ba6da90f76287feece50a56bfaa2fa6e6308f Mon Sep 17 00:00:00 2001 From: pal155 Date: Wed, 21 Jul 2021 07:00:22 +1000 Subject: [PATCH 05/19] Use lucene API for searches --- data/ala-taxon-config.json | 16 + .../ala/names/index/TaxonConceptInstance.java | 2 - .../java/au/org/ala/names/index/Taxonomy.java | 1 + .../au/org/ala/names/model/FieldType.java | 245 ++++++++++++ .../model/LinnaeanRankClassification.java | 27 +- .../org/ala/names/model/NameIndexField.java | 133 +++++-- .../org/ala/names/search/ALANameIndexer.java | 216 +++++------ .../org/ala/names/search/ALANameSearcher.java | 356 ++++++++---------- .../org/ala/names/search/DwcaNameIndexer.java | 26 +- .../ala/names/search/ALANameSearcherTest.java | 35 +- .../ala/names/search/BiocacheMatchTest.java | 37 +- 11 files changed, 680 insertions(+), 414 deletions(-) create mode 100644 src/main/java/au/org/ala/names/model/FieldType.java diff --git a/data/ala-taxon-config.json b/data/ala-taxon-config.json index e776d88c5..b719d75f3 
100644 --- a/data/ala-taxon-config.json +++ b/data/ala-taxon-config.json @@ -32,6 +32,11 @@ "matchType": "REGEX", "scientificName": "Unknown( .*|)" }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "REGEX", + "scientificName": "Not assigned" + }, { "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", "matchType": "REGEX", @@ -847,6 +852,17 @@ ] } }, + { + "id" : "dr17664", + "name": "ABRSL", + "description": "ABRS Lichen Checklist", + "parent": "apni-apc", + "rightsHolder": "Commonwealth Scientific and Industrial Research Organisation", + "authority": false, + "defaultScore" : 2500, + "defaultParentTaxon": "Plantae", + "conceptResolutionPriority": "ADDITIONAL" + }, { "id" : "dr2704", "name": "CAAB", diff --git a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java b/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java index 182ed9714..df9c44ebd 100644 --- a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java +++ b/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java @@ -1008,8 +1008,6 @@ public void normalise() throws IndexBuilderException { // If you plan to change this, it is called by a parallel stream, so consisder thread safety // At the moment, this fills out inferred information only public boolean resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { - if (this.scientificName.equals("Zieria fordii")) - System.out.println("Found it"); if (this.parentNameUsageID != null) { this.parent = taxonomy.getInstance(this.parentNameUsageID); } diff --git a/src/main/java/au/org/ala/names/index/Taxonomy.java b/src/main/java/au/org/ala/names/index/Taxonomy.java index 0580f76ea..b5e7eeeb0 100644 --- a/src/main/java/au/org/ala/names/index/Taxonomy.java +++ b/src/main/java/au/org/ala/names/index/Taxonomy.java @@ -1470,6 +1470,7 @@ public void createWorkingIndex() throws IOException { indexer.commitLoadingIndexes(); indexer.generateIndex(); indexer.create(interim); + 
indexer.createIrmng(null); indexer.commit(); } catch (Exception ex) { throw new IndexBuilderException("Unable to build working index"); diff --git a/src/main/java/au/org/ala/names/model/FieldType.java b/src/main/java/au/org/ala/names/model/FieldType.java new file mode 100644 index 000000000..16231e761 --- /dev/null +++ b/src/main/java/au/org/ala/names/model/FieldType.java @@ -0,0 +1,245 @@ +package au.org.ala.names.model; + +import au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.*; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.QueryBuilder; + +import java.util.function.BiConsumer; +import java.util.function.BiFunction; + +/** + * The type of field stored in the lucene index. + *

+ * Used to determine how to store and search for a field. + *

+ */ +abstract public class FieldType { + protected static final ThreadLocal ANALYZER = ThreadLocal.withInitial( + () -> LowerCaseKeywordAnalyzer.newInstance() + ); + protected static final ThreadLocal QUERY_BUILDER = ThreadLocal.withInitial( + () -> new QueryBuilder(ANALYZER.get()) + ); + protected static final ThreadLocal TERM_FIELD_TYPE = ThreadLocal.withInitial( + () -> { + org.apache.lucene.document.FieldType ft = new org.apache.lucene.document.FieldType(TextField.TYPE_STORED); + ft.setOmitNorms(true); + return ft; + } + ); + + /** The class of term stored */ + private Class class_; + /** The name of the field type */ + private String name; + + /** + * Construct with a name + * + * @param name The name + */ + public FieldType(Class class_, String name) { + this.class_ = class_; + this.name = name; + } + + /** + * Store a field into a lucene document. + *

+ * This may involve storing multiple lucene fields for range types. + *

+ * + * @param value The value to store + * @param name The name of the field + * @param document The document to add the field to + */ + abstract public void store(T value, String name, Document document); + + /** + * Generate a query for a field of this type. + * + * @param value The value to search for + * @param name The field name + * @return A query that searches for the value + */ + abstract public Query search(T value, String name); + + /** + * Search for a value in a range (inclusive). + *

+ * By default, this throws an {@link UnsupportedOperationException}. + * Types that have a concept of range can override this to implement a range search. + *

+ * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * + * @return A query based on the range + */ + public Query searchRange(T lower, T upper, String name) { + throw new UnsupportedOperationException("Field type " + this.name + " does not support ranges"); + } + + + /** + * Store-only field. + */ + public static final FieldType STORE = new FieldType(String.class,"store") { + @Override + public void store(String value, String name, Document document) { + document.add(new StoredField(name, value)); + } + + @Override + public Query search(String value, String name) { + throw new UnsupportedOperationException("Store-only field"); + } + }; + + /** + * An exact identifier. + *

+ * Storage and search is accomplished via exact lookup. + *

+ */ + public static final FieldType IDENTIFIER = new FieldType(String.class,"identifier") { + @Override + public void store(String value, String name, Document document) { + document.add(new StringField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + return new TermQuery(new Term(name, value)); + } + }; + + /** + * A simple term. + *

+ * Storage and search is accomplished via case-insensitive storage and lookup. + *

+ */ + public static final FieldType TERM = new FieldType(String.class, "term") { + @Override + public void store(String value, String name, Document document) { + Field field = new Field(name, value, TERM_FIELD_TYPE.get()); + document.add(field); + } + + @Override + public Query search(String value, String name) { + return QUERY_BUILDER.get().createPhraseQuery(name, value); + } + }; + + /** + * A tokenisable term. + *

+ * Storage and search is accomplished via case-insensitive tokenisation and search + *

+ */ + public static final FieldType TEXT = new FieldType(String.class, "text") { + + @Override + public void store(String value, String name, Document document) { + document.add(new TextField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + return QUERY_BUILDER.get().createPhraseQuery(name, value); + } + }; + + /** + * A common name. + *

+ * Storage and search are based on a simplified lookup where the value is upper-cased + * and non-alphanumeric characters are removed. + *

+ */ + public static final FieldType COMMON = new FieldType(String.class,"common") { + @Override + public void store(String value, String name, Document document) { + value = value.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""); + document.add(new StringField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + value = value.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""); + return new TermQuery(new Term(name, value)); + } + }; + + /** + * An integer term. + *

+ * Storage and search allow range-based queries. + *

+ */ + public static final FieldType INTEGER = new FieldType(Integer.class, "integer") { + @Override + public void store(Integer value, String name, Document document) { + document.add(new IntPoint(name, value)); + document.add(new StoredField(name, value)); + } + + @Override + public Query search(Integer value, String name) { + return IntPoint.newExactQuery(name, value); + } + + /** + * Search for a value in a range (inclusive). + * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * @return A query based on the range + */ + @Override + public Query searchRange(Integer lower, Integer upper, String name) { + return IntPoint.newRangeQuery(name, lower, upper); + } + }; + + /** + * A double term. + *

+ * Storage and search allow range-based queries. + *

+ */ + public static final FieldType DOUBLE = new FieldType(Double.class, "double") { + @Override + public void store(Double value, String name, Document document) { + document.add(new DoublePoint(name, value)); + document.add(new StoredField(name, value)); + } + + @Override + public Query search(Double value, String name) { + return DoublePoint.newExactQuery(name, value); + } + + /** + * Search for a value in a range (inclusive). + * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * @return A query based on the range + */ + @Override + public Query searchRange(Double lower, Double upper, String name) { + return DoublePoint.newRangeQuery(name, lower, upper); + } + }; + +} diff --git a/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java b/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java index 244b4bf49..d21949828 100644 --- a/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java +++ b/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java @@ -5,6 +5,8 @@ import org.apache.commons.lang.builder.HashCodeBuilder; import org.apache.commons.lang.builder.ToStringBuilder; import org.apache.commons.lang3.StringUtils; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; /** * A model object that represents a Linnaean Classification. @@ -465,27 +467,26 @@ public boolean hasIdenticalClassification(LinnaeanRankClassification lrc, RankTy * @param optional Indicates whether the the terms should be optional * @return */ - public String getLuceneSearchString(boolean optional) { - String prefix = optional ? " " : " +"; + public void appendLuceneQuery(BooleanQuery.Builder builder, boolean optional) { + BooleanClause.Occur occurs = optional ? 
BooleanClause.Occur.SHOULD : BooleanClause.Occur.FILTER; StringBuilder sb = new StringBuilder(); if (StringUtils.isNotEmpty(kingdom)) - sb.append(prefix).append(RankType.KINGDOM.getRank()).append(":\"").append(kingdom).append("\""); - if (StringUtils.isNotEmpty(phylum)) - sb.append(prefix).append(RankType.PHYLUM.getRank()).append(":\"").append(phylum).append("\""); + builder.add(NameIndexField.KINGDOM.search(this.kingdom), occurs); + if (StringUtils.isNotEmpty(phylum)) + builder.add(NameIndexField.PHYLUM.search(this.phylum), occurs); if (StringUtils.isNotEmpty(klass)) - sb.append(prefix).append(RankType.CLASS.getRank()).append(":\"").append(klass).append("\""); - if (StringUtils.isNotEmpty(order)) - sb.append(prefix).append(RankType.ORDER.getRank()).append(":\"").append(order).append("\""); + builder.add(NameIndexField.CLASS.search(this.klass), occurs); + if (StringUtils.isNotEmpty(order)) + builder.add(NameIndexField.ORDER.search(this.order), occurs); if (StringUtils.isNotEmpty(family)) - sb.append(prefix).append(RankType.FAMILY.getRank()).append(":\"").append(family).append("\""); + builder.add(NameIndexField.FAMILY.search(this.family), occurs); if (StringUtils.isNotEmpty(genus)) - sb.append(prefix).append(RankType.GENUS.getRank()).append(":\"").append(genus).append("\""); + builder.add(NameIndexField.GENUS.search(this.genus), occurs); if (StringUtils.isNotEmpty(species)) - sb.append(prefix).append(RankType.SPECIES.getRank()).append(":\"").append(species).append("\""); + builder.add(NameIndexField.SPECIES.search(this.species), occurs); //authorship is always optional due to inconsistencies in the name format etc... 
if (StringUtils.isNotEmpty(authorship)) - sb.append(" ").append(NameIndexField.AUTHOR.toString()).append(":\"").append(authorship).append("\"~"); - return sb.toString(); + builder.add(NameIndexField.AUTHOR.search(this.authorship), BooleanClause.Occur.SHOULD); } diff --git a/src/main/java/au/org/ala/names/model/NameIndexField.java b/src/main/java/au/org/ala/names/model/NameIndexField.java index baafd43ae..ec5237840 100644 --- a/src/main/java/au/org/ala/names/model/NameIndexField.java +++ b/src/main/java/au/org/ala/names/model/NameIndexField.java @@ -14,6 +14,11 @@ */ package au.org.ala.names.model; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.WildcardQuery; + /** * An Enum for all the fields that are indexed for the name matching. This enum is used by * {@link au.org.ala.names.search.ALANameIndexer} to create the index and @@ -22,39 +27,115 @@ * @author Natasha Carter */ public enum NameIndexField { - ID("id"), - LSID("lsid"), - PARENT_ID("parent_id"), - DOCUMENT_TYPE("doctype"), - ACCEPTED("accepted_lsid"), - iS_SYNONYM("is_synonym"),//whether or not the record is a synonym - GENUS("genus"), - GENUS_EX("genus_ex"), //genus sounds like expression - handles masculine and feminine too. - SPECIES_EX("specific_ex"),// specific epithet sounds like expression - INFRA_EX("infra_ex"),//infra specific epithet sounds like expression - SPECIFIC("specific"), - INFRA_SPECIFIC("infra"), - NAME("name"),// search name - OTHER_NAMES("other_names"),// Alternative names - NAME_CANONICAL("name_canonical"), // Canonical name - NAME_COMPLETE("name_complete"), // Complete name - RANK_ID("rank_id"), - RANK("rank"), - AUTHOR("author"), - PHRASE("phrase"),//stores the values of a "phrase" name. Some more intelligence will be needed when matching these - VOUCHER("voucher"), //stores a voucher value minus the spaces and fullstops. 
- ALA("ala"), //stores whether or not it is an ALA generated name - DATASET_ID("dataset_id"), // The source dataset - SYNONYM_TYPE("syn_type"), //stores the type of synonym that it represents + ID("id", FieldType.IDENTIFIER), + GUID("guid", FieldType.IDENTIFIER), + OTHER_GUID("otherGuid", FieldType.IDENTIFIER), + LEFT("left", FieldType.INTEGER), + RIGHT("right", FieldType.INTEGER), + LSID("lsid", FieldType.IDENTIFIER), + REAL_LSID("reallsid", FieldType.STORE), + PARENT_ID("parent_id", FieldType.IDENTIFIER), + DOCUMENT_TYPE("doctype", FieldType.IDENTIFIER), + ACCEPTED("accepted_lsid", FieldType.IDENTIFIER), + iS_SYNONYM("is_synonym", FieldType.IDENTIFIER),//whether or not the record is a synonym + KINGDOM("kingdom", FieldType.TERM), + KINGDOM_ID("kid", FieldType.STORE), + PHYLUM("phylum", FieldType.TERM), + PHYLUM_ID("pid", FieldType.STORE), + CLASS("class", FieldType.TERM), + CLASS_ID("cid", FieldType.STORE), + ORDER("order", FieldType.TERM), + ORDER_ID("oid", FieldType.STORE), + FAMILY("family", FieldType.TERM), + FAMILY_ID("fid", FieldType.STORE), + GENUS("genus", FieldType.TERM), + GENUS_ID("gid", FieldType.STORE), + GENUS_EX("genus_ex", FieldType.TERM), //genus sounds like expression - handles masculine and feminine too. 
+ SPECIES("species", FieldType.TERM), + SPECIES_ID("sid", FieldType.STORE), + SPECIES_EX("specific_ex", FieldType.TERM),// specific epithet sounds like expression + INFRA_EX("infra_ex", FieldType.TERM),//infra specific epithet sounds like expression + SPECIFIC("specific", FieldType.TERM), + INFRA_SPECIFIC("infra", FieldType.TERM), + NAME("name", FieldType.TEXT),// search name + OTHER_NAMES("other_names", FieldType.TEXT),// Alternative names + NAME_CANONICAL("name_canonical", FieldType.TEXT), // Canonical name + NAME_COMPLETE("name_complete", FieldType.TEXT), // Complete name + SEARCHABLE_COMMON_NAME("common", FieldType.COMMON), + COMMON_NAME("common_orig", FieldType.TEXT), + CONCAT_NAME("concat_name", FieldType.TERM), + RANK_ID("rank_id", FieldType.INTEGER), + RANK("rank", FieldType.TERM), + AUTHOR("author", FieldType.TEXT), + PHRASE("phrase", FieldType.TEXT),//stores the values of a "phrase" name. Some more intelligence will be needed when matching these + VOUCHER("voucher", FieldType.TEXT), //stores a voucher value minus the spaces and fullstops. 
+ ALA("ala", FieldType.IDENTIFIER), //stores whether or not it is an ALA generated name + DATASET_ID("dataset_id", FieldType.IDENTIFIER), // The source dataset + SYNONYM_TYPE("syn_type", FieldType.IDENTIFIER), //stores the type of synonym that it represents + HOMONYM("homonym", FieldType.IDENTIFIER), + LANGUAGE("lang", FieldType.IDENTIFIER), /* Stores the priority score associated with a taxon */ - PRIORITY("priority"); + PRIORITY("priority", FieldType.INTEGER); + + /** The field name */ String name; + /** The field type */ + FieldType type; - NameIndexField(String name) { + NameIndexField(String name, FieldType type) { this.name = name; + this.type = type; } public String toString() { return name; } + + /** + * Store a value into this field in a document + * + * @param value The value + * @param document The document + */ + public void store(Object value, Document document) { + if (value == null) + return; + this.type.store(value, this.name, document); + } + + /** + * Make a query for this field for a value. + * + * @param value The value + * + * @return A matching query + */ + public Query search(Object value) { + return this.type.search(value, this.name); + } + + /** + * Make a range query for this field for a value. + * + * @param lower The lower value (inclusive) + * @param upper The upper value (inclusive) + * + * @return A matching query + */ + public Query searchRange(Object lower, Object upper) { + return this.type.searchRange(lower, upper, this.name); + } + + + /** + * Make a wildcard query for this field for a value. 
+ * + * @param value The value, including "*" for wildcards + * + * @return A matching query + */ + public Query searchWildcard(String value) { + return new WildcardQuery(new Term(this.name, value)); + } + } diff --git a/src/main/java/au/org/ala/names/search/ALANameIndexer.java b/src/main/java/au/org/ala/names/search/ALANameIndexer.java index 0661cf64a..f96890c65 100644 --- a/src/main/java/au/org/ala/names/search/ALANameIndexer.java +++ b/src/main/java/au/org/ala/names/search/ALANameIndexer.java @@ -29,8 +29,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; -import org.apache.lucene.document.*; -import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -117,35 +116,6 @@ public class ALANameIndexer { private String indexDirectory; private IndexWriter cbIndexWriter; - //Fields that are being indexed or stored in the lucene index - public enum IndexField { - - NAME("name"), - NAMES("names"), - ID("id"), - RANK("rank"), - SEARCHABLE_NAME("searchcan"), - LSID("lsid"), - HOMONYM("homonym"), - ACCEPTED("synonym"), - LEFT("left"), - RIGHT("right"), - PRIORITY("priority"), - SEARCHABLE_COMMON_NAME("common"), - COMMON_NAME("common_orig"), - LANGUAGE("lang"); - - String name; - - IndexField(String name) { - this.name = name; - } - - public String toString() { - return name; - } - } - PhraseNameParser parser = new PhraseNameParser(); Set knownHomonyms = new HashSet(); Set blacklist = new HashSet(); @@ -244,11 +214,10 @@ private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception { Document doc = new Document(); String id = values[POS_ID]; String guid = values[POS_LSID]; - doc.add(new StringField("id", id, Store.YES)); - if (StringUtils.isEmpty(id)) + NameIndexField.ID.store(id, doc); 
+ if (StringUtils.isEmpty(id)) guid = id; - - doc.add(new StoredField("guid", guid)); + NameIndexField.GUID.store(guid, doc); iw.addDocument(doc); } System.out.println("Finished writing the tmp guid index..."); @@ -348,13 +317,13 @@ private void indexALA(IndexWriter iw, String file, String synonymFile) throws Ex values[POS_PID], values[POS_C], values[POS_CID], values[POS_O], values[POS_OID], values[POS_F], values[POS_FID], values[POS_G], values[POS_GID], values[POS_S], values[POS_SID], - values[POS_LFT], values[POS_RGT], acceptedValues, + Integer.parseInt(values[POS_LFT]), Integer.parseInt(values[POS_RGT]), acceptedValues, values[POS_SP_EPITHET], values[POS_INFRA_EPITHET], values[POS_AUTHOR], null, null, priority); //add the excluded information if applicable if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) { - doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(), SynonymType.EXCLUDES.getId().toString(), Store.YES)); + NameIndexField.SYNONYM_TYPE.store(SynonymType.EXCLUDES.getId().toString(), doc); } if (doc != null) { iw.addDocument(doc); @@ -442,44 +411,44 @@ protected void indexIrmngDwcA(IndexWriter iw, String archiveDirectory) throws Ex Document doc = new Document(); String kingdom = dwcr.value(DwcTerm.kingdom); if (StringUtils.isNotEmpty(kingdom)) { - doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES)); + NameIndexField.KINGDOM.store(kingdom, doc); } String phylum = dwcr.value(DwcTerm.phylum); if (StringUtils.isNotEmpty(phylum)) { - doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES)); + NameIndexField.PHYLUM.store(phylum, doc); } String classs = dwcr.value(DwcTerm.class_); if (StringUtils.isNotEmpty(classs)) { - doc.add(new TextField(RankType.CLASS.getRank(), classs, Store.YES)); + NameIndexField.CLASS.store(classs, doc); } String order = dwcr.value(DwcTerm.order); if (StringUtils.isNotEmpty(order)) { - doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES)); + 
NameIndexField.ORDER.store(order, doc); } String family = dwcr.value(DwcTerm.family); if (StringUtils.isNotEmpty(family)) { - doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES)); + NameIndexField.FAMILY.store(family, doc); } String genus = dwcr.value(DwcTerm.genus); String calculatedRank = "genus"; if (StringUtils.isNotEmpty(genus)) { - doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES)); + NameIndexField.GENUS.store(genus, doc); String specificEpithet = dwcr.value(DwcTerm.specificEpithet); if (StringUtils.isNotEmpty(specificEpithet)) { calculatedRank = "species"; - doc.add(new TextField(RankType.SPECIES.getRank(), genus + " " + specificEpithet, Store.YES)); + NameIndexField.SPECIES.store(genus + " " + specificEpithet, doc); } } String rank = dwcr.value(DwcTerm.taxonRank); if (StringUtils.isEmpty(rank)) rank = calculatedRank; - doc.add(new TextField(IndexField.RANK.toString(), rank, Store.YES)); + NameIndexField.RANK.store(rank, doc); //now add the author - we don't do anything about this on homonym resolution yet //Add the author information String author = dwcr.value(DwcTerm.scientificNameAuthorship); if (StringUtils.isNotEmpty(author)) { //TODO think about whether we need to treat the author string with the taxamatch - doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES)); + NameIndexField.AUTHOR.store(author, doc); } //now add it to the index iw.addDocument(doc); @@ -504,20 +473,21 @@ void indexIRMNG(IndexWriter iw, String irmngExport, RankType rank) throws Except while ((values = reader.readNext()) != null) { Document doc = new Document(); if (values != null && values.length >= 7) { - doc.add(new TextField(RankType.KINGDOM.getRank(), values[0], Store.YES)); - doc.add(new TextField(RankType.PHYLUM.getRank(), values[1], Store.YES)); - doc.add(new TextField(RankType.CLASS.getRank(), values[2], Store.YES)); - doc.add(new TextField(RankType.ORDER.getRank(), values[3], Store.YES)); - doc.add(new
TextField(RankType.FAMILY.getRank(), values[4], Store.YES)); - doc.add(new TextField(RankType.GENUS.getRank(), values[5], Store.YES)); + NameIndexField.KINGDOM.store(values[0], doc); + NameIndexField.PHYLUM.store(values[1], doc); + NameIndexField.CLASS.store(values[2], doc); + NameIndexField.ORDER.store(values[3], doc); + NameIndexField.FAMILY.store(values[4], doc); + NameIndexField.GENUS.store(values[5], doc); if (rank == RankType.GENUS) { - doc.add(new TextField(IndexField.ID.toString(), values[6], Store.YES)); - doc.add(new TextField(IndexField.ACCEPTED.toString(), values[8], Store.YES)); - doc.add(new TextField(IndexField.HOMONYM.toString(), values[10], Store.YES)); + + NameIndexField.ID.store(values[6], doc); + NameIndexField.ACCEPTED.store(values[8], doc); + NameIndexField.HOMONYM.store(values[10], doc); } else if (rank == RankType.SPECIES) { - doc.add(new TextField(RankType.SPECIES.getRank(), values[6], Store.YES)); + NameIndexField.SPECIES.store(values[6], doc); } - doc.add(new TextField(IndexField.RANK.toString(), rank.getRank(), Store.YES)); + NameIndexField.RANK.store(rank.getRank(), doc); iw.addDocument(doc); count++; } @@ -651,9 +621,9 @@ protected void createExtraIdIndex(IndexWriter iw, File idFile) throws Exception if (values != null && values.length >= 3) { Document doc = new Document(); //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED)); - doc.add(new StringField("lsid", values[2], Store.YES)); + NameIndexField.LSID.store(values[2], doc); //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO)); - doc.add(new StoredField("reallsid", values[1])); + NameIndexField.REAL_LSID.store(values[1], doc); iw.addDocument(doc); } } @@ -700,7 +670,7 @@ private IndexSearcher createTmpIndex(String tcFileName) throws Exception { //just add the LSID to the index Document doc = new Document(); - doc.add(new StringField("lsid", values[0], Store.YES)); + NameIndexField.LSID.store(values[0], doc); iw.addDocument(doc); } @@ -757,31 +727,28 @@ 
protected Document createCommonNameDocument(String cn, String sn, String lsid, S protected Document createCommonNameDocument(String cn, String sn, String lsid, String language, boolean checkAccepted) { Document doc = new Document(); - //we are only interested in keeping all the alphanumerical values of the common name - //when searching the same operations will need to be peformed on the search string - TextField searchAbleName = new TextField(IndexField.SEARCHABLE_COMMON_NAME.toString(), cn.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""), Store.YES); - doc.add(searchAbleName); + // Uses field type to normalise + NameIndexField.SEARCHABLE_COMMON_NAME.store(cn, doc); if (sn != null) { - doc.add(new TextField(IndexField.NAME.toString(), sn, Store.YES)); + NameIndexField.NAME.store(sn, doc); } String newLsid = getAcceptedLSID(lsid); - - doc.add(new TextField(IndexField.COMMON_NAME.toString(), cn, Store.YES)); - doc.add(new TextField(IndexField.LSID.toString(), newLsid, Store.YES)); + NameIndexField.COMMON_NAME.store(cn, doc); + NameIndexField.LSID.store(newLsid, doc); if(language != null) { - doc.add(new TextField(IndexField.LANGUAGE.toString(), language.toLowerCase().trim(), Store.YES)); - } + NameIndexField.LANGUAGE.store(language.toLowerCase().trim(), doc); + } return doc; } public Document createALAIndexDocument(String name, String id, String lsid, String author, LinnaeanRankClassification cl){ - return createALAIndexDocument(name,id, lsid, author,null,null, null, null, cl, null, null, MatchMetrics.DEFAULT_PRIORITY); + return createALAIndexDocument(name,id, lsid, author,null,null, 0, 0, cl, null, null, MatchMetrics.DEFAULT_PRIORITY); } - public Document createALAIndexDocument(String name, String id, String lsid, String author, String rank, String rankId, String left, String right, LinnaeanRankClassification cl, String nameComplete, Collection otherNames, int priority){ + public Document createALAIndexDocument(String name, String id, String lsid, String 
author, String rank, String rankId, int left, int right, LinnaeanRankClassification cl, String nameComplete, Collection otherNames, int priority){ if(cl == null) cl = new LinnaeanRankClassification(); return createALAIndexDocument(name, id, lsid, rankId, rank, cl.getKingdom(), cl.getKid(), cl.getPhylum() @@ -792,11 +759,11 @@ public Document createALAIndexDocument(String name, String id, String lsid, Stri protected Document createALASynonymDocument(String scientificName, String author, String nameComplete, Collection otherNames, String id, String lsid, String nameLsid, String acceptedLsid, String acceptedId, int priority, String synonymType) { lsid = StringUtils.isBlank(lsid) ? nameLsid : lsid; Document doc = createALAIndexDocument(scientificName, id, lsid, null, null, - null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, null, null, null, null, null, null, null, 0, 0, acceptedLsid, null, null, author, nameComplete, otherNames, priority); if (doc != null && synonymType != null) { try { - doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(), synonymType, Store.YES)); + NameIndexField.SYNONYM_TYPE.store(synonymType, doc); } catch (Exception e) { System.out.println("Error on " + scientificName + " " + author + " " + id + ". 
" + e.getMessage()); } @@ -811,7 +778,7 @@ private boolean isBlacklisted(String scientificName) { protected Document createALAIndexDocument(String name, String id, String lsid, String rank, String rankString, String kingdom, String kid, String phylum, String pid, String clazz, String cid, String order, String oid, String family, String fid, String genus, String gid, - String species, String sid, String left, String right, String acceptedConcept, String specificEpithet, + String species, String sid, int left, int right, String acceptedConcept, String specificEpithet, String infraspecificEpithet, String author, String nameComplete, Collection otherNames, int priority) { // @@ -820,6 +787,7 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S return null; } + int rankIndex = StringUtils.isEmpty(rank) ? -1 : Integer.parseInt(rank); nameComplete = buildNameComplete(name, author, nameComplete); CleanedScientificName cname = new CleanedScientificName(name); CleanedScientificName cnameComplete = new CleanedScientificName(nameComplete); @@ -827,11 +795,10 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S String soundexGenus = genus; //Add the ids - doc.add(new StringField(NameIndexField.ID.toString(), id, Store.YES)); - - doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Store.YES)); + NameIndexField.ID.store(id, doc); + NameIndexField.LSID.store(lsid, doc); if (lsid.startsWith("ALA")) { - doc.add(new StringField(NameIndexField.ALA.toString(), "T", Store.YES)); + NameIndexField.ALA.store("T", doc); } @@ -843,85 +810,83 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S nameSet.add(cnameComplete.getNormalised()); nameSet.add(cnameComplete.getBasic()); for (String n: nameSet) { - Field f = new TextField(NameIndexField.NAME.toString(), n, Store.YES); - doc.add(f); + NameIndexField.NAME.store(n, doc); } - - doc.add(new
StringField(NameIndexField.NAME_CANONICAL.toString(), cname.getNormalised(), Store.YES)); - doc.add(new StringField(NameIndexField.NAME_COMPLETE.toString(), cnameComplete.getNormalised(), Store.YES)); + NameIndexField.NAME_CANONICAL.store(cname.getNormalised(), doc); + NameIndexField.NAME_COMPLETE.store(cnameComplete.getNormalised(), doc); //rank information - if (StringUtils.isNotEmpty(rank)) { - doc.add(new StringField(NameIndexField.RANK_ID.toString(), rank, Store.YES)); + if (rankIndex >= 0) { + NameIndexField.RANK_ID.store(rankIndex, doc); } if (StringUtils.isNotEmpty(rankString)) { - doc.add(new StringField(NameIndexField.RANK.toString(), rankString, Store.YES)); + NameIndexField.RANK.store(rankString, doc); } //handle the synonyms if (StringUtils.isNotEmpty(acceptedConcept)) { - doc.add(new StringField(NameIndexField.ACCEPTED.toString(), acceptedConcept, Store.YES)); - doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Store.YES)); - } else { - doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Store.YES)); + NameIndexField.ACCEPTED.store(acceptedConcept, doc); + NameIndexField.iS_SYNONYM.store("T", doc); + } else { + NameIndexField.iS_SYNONYM.store("F", doc); } //Add the classification information if (StringUtils.trimToNull(kingdom) != null) { - doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES)); - if (StringUtils.isNotBlank(kid)) { - doc.add(new StoredField("kid", kid)); + NameIndexField.KINGDOM.store(kingdom, doc); + if (StringUtils.isNotBlank(kid)) { + NameIndexField.KINGDOM_ID.store(kid, doc); } } if (StringUtils.trimToNull(phylum) != null) { - doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES)); + NameIndexField.PHYLUM.store(phylum, doc); if (StringUtils.isNotBlank(pid)) { - doc.add(new StoredField("pid", pid)); + NameIndexField.PHYLUM_ID.store(pid, doc); } } if (StringUtils.trimToNull(clazz) != null) { - doc.add(new TextField(RankType.CLASS.getRank(), clazz, Store.YES)); + 
NameIndexField.CLASS.store(clazz, doc); if (StringUtils.isNotBlank(cid)) { - doc.add(new StoredField("cid", cid)); + NameIndexField.CLASS_ID.store(cid, doc); } } if (StringUtils.trimToNull(order) != null) { - doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES)); + NameIndexField.ORDER.store(order, doc); if (StringUtils.isNotBlank(oid)) { - doc.add(new StoredField("oid", oid)); + NameIndexField.ORDER_ID.store(oid, doc); } } if (StringUtils.trimToNull(family) != null) { - doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES)); + NameIndexField.FAMILY.store(family, doc); if (StringUtils.isNotBlank(fid)) { - doc.add(new StoredField("fid", fid)); + NameIndexField.FAMILY_ID.store(fid, doc); } } if (StringUtils.trimToNull(genus) != null) { - doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES)); + NameIndexField.GENUS.store(genus, doc); if (StringUtils.isNotBlank(gid)) { - doc.add(new StoredField("gid", gid)); + NameIndexField.GENUS_ID.store(gid, doc); } } if (StringUtils.trimToNull(species) != null) { - doc.add(new TextField(RankType.SPECIES.getRank(), species, Store.YES)); + NameIndexField.SPECIES.store(species, doc); if (StringUtils.isNotBlank(sid)) { - doc.add(new StoredField("sid", sid)); + NameIndexField.SPECIES_ID.store(sid, doc); } } - if (StringUtils.trimToNull(left) != null) { - doc.add(new StringField("left", left, Store.YES)); + if (left > 0) { + NameIndexField.LEFT.store(left, doc); } - if (StringUtils.trimToNull(right) != null) { - doc.add(new StringField("right", right, Store.YES)); + if (right > 0) { + NameIndexField.RIGHT.store(right, doc); } - doc.add(new StoredField("priority", priority)); + NameIndexField.PRIORITY.store(priority, doc); //Add the author information if (StringUtils.isNotEmpty(author)) { //TODO think about whether we need to treat the author string with the taxamatch - doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES)); + NameIndexField.AUTHOR.store(author, doc); } @@ 
-936,8 +901,7 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S && cn.getType() != NameType.INFORMAL && !"6500".equals(rank) && cn.getType() != NameType.DOUBTFUL) { if (!nameSet.contains(cn.canonicalName())) { - Field f2 = new TextField(NameIndexField.NAME.toString(), cn.canonicalName(), Store.YES); - doc.add(f2); + NameIndexField.NAME.store(cn.canonicalName(), doc); } if (specificEpithet == null && cn.isBinomial()) { //check to see if we need to determine the epithets from the parse @@ -949,30 +913,28 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S //check to see if the concept represents a phrase name if (cn != null && cn instanceof ALAParsedName) { //set up the field type that is stored and Index.ANALYZED_NO_NORMS - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setOmitNorms(true); ALAParsedName alapn = (ALAParsedName) cn; if (alapn.getRank() != Rank.SPECIES && alapn.getSpecificEpithet() != null) { - doc.add(new Field(NameIndexField.SPECIFIC.toString(), alapn.getSpecificEpithet(), ft)); + NameIndexField.SPECIFIC.store(alapn.getSpecificEpithet(), doc); } else if (alapn.getRank() != Rank.SPECIES && alapn.getSpecificEpithet() == null) { log.warn(lsid + " " + name + " has an empty specific for non sp. phrase"); } if (StringUtils.trimToNull(alapn.getLocationPhraseDescription()) != null) { - doc.add(new Field(NameIndexField.PHRASE.toString(), alapn.cleanPhrase, ft)); + NameIndexField.PHRASE.store(alapn.cleanPhrase, doc); } if (alapn.getPhraseVoucher() != null) { - doc.add(new Field(NameIndexField.VOUCHER.toString(), alapn.cleanVoucher, ft)); + NameIndexField.VOUCHER.store(alapn.cleanVoucher, doc); } if (StringUtils.isBlank(genus) && StringUtils.isNotBlank(alapn.getGenusOrAbove())) { //add the genus to the index as it is necessary to match on the phrase name. 
- doc.add(new TextField(RankType.GENUS.getRank(), alapn.getGenusOrAbove(), Store.YES)); + NameIndexField.GENUS.store(alapn.getGenusOrAbove(), doc); } } } catch (org.gbif.api.exception.UnparsableException e) { //check to see if the name is a virus in which case an extra name is added without the virus key word if (e.type == NameType.VIRUS) { - doc.add(new TextField(NameIndexField.NAME.toString(), ALANameSearcher.virusStopPattern.matcher(name).replaceAll(" "), Store.YES)); + NameIndexField.NAME.store(ALANameSearcher.virusStopPattern.matcher(name).replaceAll(" "), doc); } } catch (Exception e) { @@ -983,24 +945,24 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S //add the sound expressions for the name if required try { if (StringUtils.isNotBlank(soundexGenus)) { - doc.add(new TextField(NameIndexField.GENUS_EX.toString(), TaxonNameSoundEx.treatWord(soundexGenus, "genus"), Store.YES)); + NameIndexField.GENUS_EX.store(TaxonNameSoundEx.treatWord(soundexGenus, "genus"), doc); } if (StringUtils.isNotBlank(specificEpithet)) { String soundex = TaxonNameSoundEx.treatWord(specificEpithet, "species"); if (soundex == null) soundex = ""; - doc.add(new TextField(NameIndexField.SPECIES_EX.toString(), soundex, Store.YES)); + NameIndexField.SPECIES_EX.store(soundex, doc); } else if (StringUtils.isNotBlank(soundexGenus)) { - doc.add(new TextField(NameIndexField.SPECIES_EX.toString(), "", Store.YES)); + NameIndexField.SPECIES_EX.store("", doc); } if (StringUtils.isNotBlank(infraspecificEpithet)) { String soundex = TaxonNameSoundEx.treatWord(infraspecificEpithet, "species"); if (soundex == null) soundex = ""; - doc.add(new TextField(NameIndexField.INFRA_EX.toString(), soundex, Store.YES)); - } else if (StringUtils.isNotBlank(specificEpithet)) { + NameIndexField.INFRA_EX.store(soundex, doc); + } else if (StringUtils.isNotBlank(specificEpithet)) { //make searching for an empty infraspecific soudex easier - doc.add(new 
TextField(NameIndexField.INFRA_EX.toString(), "", Store.YES)); + NameIndexField.INFRA_EX.store("", doc); } } catch (Exception e) { log.warn(lsid + " " + name + " has issues creating a soundex: " + e.getMessage()); diff --git a/src/main/java/au/org/ala/names/search/ALANameSearcher.java b/src/main/java/au/org/ala/names/search/ALANameSearcher.java index 93e616193..d1355169a 100644 --- a/src/main/java/au/org/ala/names/search/ALANameSearcher.java +++ b/src/main/java/au/org/ala/names/search/ALANameSearcher.java @@ -14,7 +14,6 @@ */ package au.org.ala.names.search; -import au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer; import au.org.ala.names.model.*; import au.org.ala.names.util.CleanedScientificName; import au.org.ala.names.util.TaxonNameSoundEx; @@ -22,12 +21,8 @@ import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; -import org.apache.lucene.index.*; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.search.*; import org.apache.lucene.store.FSDirectory; import org.gbif.api.exception.UnparsableException; @@ -36,10 +31,7 @@ import org.gbif.api.vocabulary.Rank; import org.gbif.nameparser.PhraseNameParser; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; +import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; @@ -74,8 +66,6 @@ public class ALANameSearcher { protected Log log = LogFactory.getLog(ALANameSearcher.class); protected DirectoryReader cbReader, irmngReader, vernReader; protected IndexSearcher cbSearcher, irmngSearcher, vernSearcher, idSearcher; - protected ThreadLocal queryParser; 
- protected ThreadLocal idParser; protected TaxonNameSoundEx tnse; protected PhraseNameParser parser; public static final Pattern virusStopPattern = Pattern.compile(" virus| ictv| ICTV"); @@ -95,56 +85,33 @@ public ALANameSearcher(){} * as the source directory * * @param indexDirectory The directory that contains the index files for the scientific names, irmng and vernacular names. - * @throws CorruptIndexException * @throws IOException */ public ALANameSearcher(String indexDirectory) throws IOException { //Initialise CB index searching items log.debug("Creating the search object for the name matching api..."); - //make the query parsers thread safe - queryParser = new ThreadLocal() { - @Override - protected QueryParser initialValue() { - QueryParser qp = new QueryParser("genus", LowerCaseKeywordAnalyzer.newInstance()); - qp.setFuzzyMinSim(0.8f); //fuzzy match similarity setting. used to match the authorship. - return qp; - } - }; - idParser = new ThreadLocal() { - @Override - protected QueryParser initialValue() { - return new QueryParser( "lsid", new org.apache.lucene.analysis.core.KeywordAnalyzer()); - } - }; - - cbReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "cb")));//false + cbReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "cb")));//false cbSearcher = new IndexSearcher(cbReader); //Initialise the IRMNG index searching items - irmngReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "irmng"))); + irmngReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "irmng"))); irmngSearcher = new IndexSearcher(irmngReader); //initialise the Common name index searching items - vernReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "vernacular"))); + vernReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + 
"vernacular"))); vernSearcher = new IndexSearcher(vernReader); //initialise the identifier index - idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "id")))); + idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "id")))); tnse = new TaxonNameSoundEx(); parser = new PhraseNameParser(); crossRankHomonyms = au.org.ala.names.util.FileUtils.streamToSet( this.getClass().getClassLoader().getResourceAsStream("au/org/ala/homonyms/cross_rank_homonyms.txt"), new java.util.HashSet(), true); } - private Path createIfNotExist(String indexDirectory) throws IOException { - + private Path findPath(String indexDirectory) throws IOException { File idxFile = new File(indexDirectory); - Path path = Paths.get(indexDirectory); if (!idxFile.exists()) { - FileUtils.forceMkdir(idxFile); - Analyzer analyzer = new StandardAnalyzer(); - IndexWriterConfig conf = new IndexWriterConfig(analyzer); - IndexWriter iw = new IndexWriter(FSDirectory.open(path), conf); - iw.commit(); - iw.close(); + throw new FileNotFoundException(idxFile.toString()); } + Path path = Paths.get(indexDirectory); return path; } @@ -154,8 +121,7 @@ private Path createIfNotExist(String indexDirectory) throws IOException { public void dumpSpecies() { try { OutputStreamWriter fileOut = new OutputStreamWriter(new FileOutputStream("/data/species.txt"), "UTF-8"); - Term term = new Term("rank", "species"); - TopDocs hits = cbSearcher.search(new TermQuery(term), 2000000); + TopDocs hits = cbSearcher.search(NameIndexField.RANK.search("species"), 2000000); for (ScoreDoc sdoc : hits.scoreDocs) { Document doc = cbReader.document(sdoc.doc); @@ -894,7 +860,7 @@ public NameSearchResult searchForRecord(String name, LinnaeanRankClassification */ public NameSearchResult searchForRecordByID(String id) { try { - List results = performSearch(ALANameIndexer.IndexField.ID.toString(), id, null, null, 1, null, false, 
idParser.get()); + List results = performSearch(NameIndexField.ID, id, null, null, 1, null, false); if (results.size() > 0) { results.get(0).setMatchType(MatchType.TAXON_ID); return results.get(0); @@ -1021,7 +987,7 @@ private List searchForRecords(String name, RankType rank, Linn log.warn("Unable to parse " + name + ". " + e.getMessage()); } //Check for the exact match - List hits = performSearch(NameIndexField.NAME.toString(), cleaned.getNormalised(), rank, cl, max, MatchType.EXACT, true, queryParser.get()); + List hits = performSearch(NameIndexField.NAME, cleaned.getNormalised(), rank, cl, max, MatchType.EXACT, true); if (hits == null) // situation where searcher has not been initialised { return null; @@ -1043,12 +1009,13 @@ private List searchForRecords(String name, RankType rank, Linn String voucher = alapn.cleanVoucher; //String voucher = alapn.phraseVoucher != null ? voucherRemovePattern.matcher(alapn.phraseVoucher).replaceAll("") :null; String specific = alapn.getRank() != null && alapn.getRank().equals(Rank.SPECIES) ? null : alapn.getSpecificEpithet(); - String[][] searchFields = new String[4][]; - searchFields[0] = new String[]{RankType.GENUS.getRank(), genus}; - searchFields[1] = new String[]{NameIndexField.PHRASE.toString(), phrase}; - searchFields[2] = new String[]{NameIndexField.VOUCHER.toString(), voucher}; - searchFields[3] = new String[]{NameIndexField.SPECIFIC.toString(), specific}; - hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false, queryParser.get()); //don't want to check for homonyms yet... + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS, genus), + Value.of(NameIndexField.PHRASE, phrase), + Value.of(NameIndexField.VOUCHER, voucher), + Value.of(NameIndexField.SPECIFIC, specific) + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false); //don't want to check for homonyms yet... 
if (hits.size() == 1) { return hits; } else if (hits.size() > 1) { @@ -1069,7 +1036,7 @@ private List searchForRecords(String name, RankType rank, Linn if (cl.getAuthorship() == null && pn.isAuthorsParsed()) { cl.setAuthorship(pn.authorshipComplete()); } - hits = performSearch(ALANameIndexer.IndexField.NAME.toString(), canonicalName, rank, cl, max, MatchType.CANONICAL, true, queryParser.get()); + hits = performSearch(NameIndexField.NAME, canonicalName, rank, cl, max, MatchType.CANONICAL, true); if (hits.size() > 0) { return hits; } @@ -1079,12 +1046,13 @@ private List searchForRecords(String name, RankType rank, Linn String phrase = pn.getCultivarEpithet(); String voucher = null; String specific = pn.getRank() != null && pn.getRank().equals(Rank.SPECIES) ? null : pn.getSpecificEpithet(); - String[][] searchFields = new String[4][]; - searchFields[0] = new String[]{RankType.GENUS.getRank(), genus}; - searchFields[1] = new String[]{NameIndexField.PHRASE.toString(), phrase}; - searchFields[2] = new String[]{NameIndexField.VOUCHER.toString(), voucher}; - searchFields[3] = new String[]{NameIndexField.SPECIFIC.toString(), specific}; - hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false, queryParser.get()); + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS, genus), + Value.of(NameIndexField.PHRASE, phrase), + Value.of(NameIndexField.VOUCHER, voucher), + Value.of(NameIndexField.SPECIFIC, specific) + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false); if (hits.size() > 0) { return hits; } @@ -1095,15 +1063,12 @@ private List searchForRecords(String name, RankType rank, Linn String genus = TaxonNameSoundEx.treatWord(pn.getGenusOrAbove(), "genus"); String specific = TaxonNameSoundEx.treatWord(pn.getSpecificEpithet(), "species"); String infra = pn.getInfraSpecificEpithet() == null ? 
null : TaxonNameSoundEx.treatWord(pn.getInfraSpecificEpithet(), "species"); - String[][] searchFields = new String[3][]; - searchFields[0] = new String[]{NameIndexField.GENUS_EX.toString(), genus}; - searchFields[1] = new String[]{NameIndexField.SPECIES_EX.toString(), specific}; - if (StringUtils.isNotEmpty(infra)) { - searchFields[2] = new String[]{NameIndexField.INFRA_EX.toString(), infra}; - } else { - searchFields[2] = new String[]{NameIndexField.INFRA_EX.toString(), ""}; - } - hits = performSearch(searchFields, rank, cl, max, MatchType.SOUNDEX, false, queryParser.get()); //don't want to check for homonyms yet... + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS_EX, genus), + Value.of(NameIndexField.SPECIES_EX, specific), + Value.of(NameIndexField.INFRA_EX, StringUtils.isNotEmpty(infra) ? infra : "") + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.SOUNDEX, false); //don't want to check for homonyms yet... if (hits.size() > 0) { return hits; } @@ -1146,140 +1111,126 @@ else if (hit.getAcceptedLsid() != null) { return acceptedLsid == null ? 
null : searchForRecordByLsid(acceptedLsid); } - private List performSearch(String field, String value, RankType rank, + private List performSearch(NameIndexField field, String value, RankType rank, LinnaeanRankClassification cl, int max, MatchType type, - boolean checkHomo, QueryParser parser) throws IOException, SearchResultException { - String[][] compValues = new String[1][]; - compValues[0] = new String[]{field, value}; - return performSearch(compValues, rank, cl, max, type, checkHomo, parser); + boolean checkHomo) throws IOException, SearchResultException { + return performSearch(Arrays.asList(Value.of(field, value)), rank, cl, max, type, checkHomo); } /** * Performs an index search based on the supplied field and name * - * @param compulsoryValues 2D array of field and value mappings to perform the search on + * @param compulsoryValues A list of required values * @param rank Optional rank of the value * @param cl The high taxa that form the classification for the search item * @param max The maximum number of results to return * @param type The type of search that is being performed * @param checkHomo Whether or not the result should check for homonyms. 
- * @param parser * @return * @throws IOException * @throws SearchResultException */ - private List performSearch(String[][] compulsoryValues, RankType rank, - LinnaeanRankClassification cl, int max, MatchType type, boolean checkHomo, - QueryParser parser) throws IOException, SearchResultException { + private List performSearch(List compulsoryValues, RankType rank, + LinnaeanRankClassification cl, int max, MatchType type, boolean checkHomo) throws IOException, SearchResultException { if (cbSearcher != null) { String scientificName = null; - StringBuilder query = new StringBuilder(); - for (String[] values : compulsoryValues) { - if (values[1] != null) { - - query.append("+" + values[0] + ":\"" + values[1] + "\""); - - if (values[0].equals(NameIndexField.NAME.toString())) - scientificName = values[1]; - } + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (Value value: compulsoryValues) { + if (value.value != null) { + builder.add(value.field.search(value.value), BooleanClause.Occur.MUST); + if (value.field == NameIndexField.NAME) + scientificName = value.value.toString(); + } } if (rank != null) { - //if the rank is below species include all names that are species level and below in case synonyms have changed ranks. - query.append("+("); - if (rank.getId() >= RankType.SPECIES.getId()) { - query.append(NameIndexField.RANK_ID.toString()).append(":[7000 TO 9999]"); - - } else - query.append(NameIndexField.RANK.toString() + ":\"" + rank.getRank() + "\""); - //cater for the situation where the search term could be a synonym that does not have a rank + int lower = rank.getId(); + int upper = rank.getId() >= RankType.SPECIES.getId() ? 9999 : rank.getId(); + BooleanQuery.Builder rankBuilder = new BooleanQuery.Builder(); + rankBuilder.add(NameIndexField.RANK_ID.searchRange(lower, upper), BooleanClause.Occur.SHOULD); + //cater for the situation where the search term could be a synonym that does not have a rank // also ALA added concepts do NOT have ranks. 
- query.append(" OR ").append(NameIndexField.iS_SYNONYM.toString()).append(":T OR ").append(NameIndexField.ALA).append(":T)"); - + rankBuilder.add(NameIndexField.iS_SYNONYM.search("T"), BooleanClause.Occur.SHOULD); + rankBuilder.add(NameIndexField.ALA.search("T"), BooleanClause.Occur.SHOULD); + builder.add(rankBuilder.build(), BooleanClause.Occur.MUST); } if (cl != null) { - query.append(cl.getLuceneSearchString(true)); - + cl.appendLuceneQuery(builder, true); } + Query query = builder.build(); - try { - Query scoreQuery = parser.parse(query.toString()); - TopDocs hits = cbSearcher.search(scoreQuery, max);//cbSearcher.search(boolQuery, max); + TopDocs hits = cbSearcher.search(query, max);//cbSearcher.search(boolQuery, max); - //now put the hits into the arrayof NameSearchResult - List results = new java.util.ArrayList(); + //now put the hits into the arrayof NameSearchResult + List results = new java.util.ArrayList(); - for (ScoreDoc sdoc : hits.scoreDocs) { - NameSearchResult nsr = new NameSearchResult(cbReader.document(sdoc.doc), type); - nsr.computeMatch(cl); - results.add(nsr); - } - results.sort(Comparator.comparing(NameSearchResult::getMatchMetrics).reversed()); - if (results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).count() > 0) { - results = results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).collect(Collectors.toList()); - } - //HOMONYM CHECKS and other checks - if (checkHomo) { - - //check to see if one of the results is excluded - if (results.size() > 0) { - int exclCount = 0; - NameSearchResult notExcludedResult = null; - NameSearchResult excludedResult = null; - for (NameSearchResult nsr : results) { - if (nsr.getSynonymType() == au.org.ala.names.model.SynonymType.EXCLUDES) { - exclCount++; - excludedResult = nsr; - } else if (notExcludedResult == null) { - notExcludedResult = nsr; - } + for (ScoreDoc sdoc : hits.scoreDocs) { + NameSearchResult nsr = new NameSearchResult(cbReader.document(sdoc.doc), 
type); + nsr.computeMatch(cl); + results.add(nsr); + } + results.sort(Comparator.comparing(NameSearchResult::getMatchMetrics).reversed()); + if (results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).count() > 0) { + results = results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).collect(Collectors.toList()); + } + //HOMONYM CHECKS and other checks + if (checkHomo) { + + //check to see if one of the results is excluded + if (results.size() > 0) { + int exclCount = 0; + NameSearchResult notExcludedResult = null; + NameSearchResult excludedResult = null; + for (NameSearchResult nsr : results) { + if (nsr.getSynonymType() == au.org.ala.names.model.SynonymType.EXCLUDES) { + exclCount++; + excludedResult = nsr; + } else if (notExcludedResult == null) { + notExcludedResult = nsr; } - if (exclCount > 0) { - //throw the basic exception if count == result size - if (exclCount == results.size()) { - throw new ExcludedNameException("The result is a name that has been excluded from the NSL", excludedResult); - } else if (notExcludedResult != null) { - //one of the results was an excluded concept - throw new ExcludedNameException("One of the results was excluded. Use the nonExcludedName for your match.", notExcludedResult, excludedResult); - } + } + if (exclCount > 0) { + //throw the basic exception if count == result size + if (exclCount == results.size()) { + throw new ExcludedNameException("The result is a name that has been excluded from the NSL", excludedResult); + } else if (notExcludedResult != null) { + //one of the results was an excluded concept + throw new ExcludedNameException("One of the results was excluded. 
Use the nonExcludedName for your match.", notExcludedResult, excludedResult); } } + } - //check to see if we have a situtation where a species has been split into subspecies and a synonym exists to the subspecies - checkForSpeciesSplit(results); + //check to see if we have a situtation where a species has been split into subspecies and a synonym exists to the subspecies + checkForSpeciesSplit(results); - //check to see if one of the results is a misapplied synonym - checkForMisapplied(results); + //check to see if one of the results is a misapplied synonym + checkForMisapplied(results); - //check result level homonyms - //TODO 2012-04-17: Work out edge case issues for canonical matches... - //checkResultLevelHomonym(results); + //check result level homonyms + //TODO 2012-04-17: Work out edge case issues for canonical matches... + //checkResultLevelHomonym(results); - //check to see if we have a cross rank homonym - //cross rank homonyms are resolvable if a rank has been supplied - if (rank == null) { - checkForCrossRankHomonym(results); - } + //check to see if we have a cross rank homonym + //cross rank homonyms are resolvable if a rank has been supplied + if (rank == null) { + checkForCrossRankHomonym(results); + } - //check to see if the search criteria could represent an unresolved genus or species homonym - if (results.size() > 0) { - RankType resRank = results.get(0).getRank(); - if ((resRank == RankType.GENUS || resRank == RankType.SPECIES) || (results.get(0).isSynonym() && (rank == null || rank == RankType.GENUS || rank == RankType.SPECIES))) { - NameSearchResult result = (cl != null && StringUtils.isNotBlank(cl.getAuthorship())) ? 
validateHomonymByAuthor(results, scientificName, cl) : validateHomonyms(results, scientificName, cl); - results.clear(); - results.add(result); - } + //check to see if the search criteria could represent an unresolved genus or species homonym + if (results.size() > 0) { + RankType resRank = results.get(0).getRank(); + if ((resRank == RankType.GENUS || resRank == RankType.SPECIES) || (results.get(0).isSynonym() && (rank == null || rank == RankType.GENUS || rank == RankType.SPECIES))) { + NameSearchResult result = (cl != null && StringUtils.isNotBlank(cl.getAuthorship())) ? validateHomonymByAuthor(results, scientificName, cl) : validateHomonyms(results, scientificName, cl); + results.clear(); + results.add(result); } } - - return results; - } catch (ParseException e) { - throw new SearchResultException("Error parsing " + query.toString() + "." + e.getMessage()); } + return results; } return null; } @@ -1528,12 +1479,10 @@ public TopDocs getIRMNGGenus(LinnaeanRankClassification cl, RankType rank) { if (cl != null && (cl.getGenus() != null || cl.getSpecies() != null)) { try { - - String searchString = "+rank:" + rank + " " + cl.getLuceneSearchString(false).trim(); - - - log.debug("Search string : " + searchString + " classification : " + cl); - Query query = queryParser.get().parse(searchString); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(NameIndexField.RANK.search(rank.getRank()), BooleanClause.Occur.MUST); + cl.appendLuceneQuery(builder, false); + Query query = builder.build(); log.debug("getIRMNG query: " + query.toString()); return irmngSearcher.search(query, 10); @@ -1627,13 +1576,13 @@ public String searchForLSIDCommonName(String commonName) { */ public String getCommonNameForLSID(String lsid) { if (lsid != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); try { TopDocs results = vernSearcher.search(query, 1); log.debug("Number of 
matches for " + lsid + " " + results.totalHits); for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - return doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + return doc.get(NameIndexField.COMMON_NAME.toString()); } } catch (IOException e) { log.debug("Unable to access document for common name.", e); @@ -1652,16 +1601,14 @@ public String getCommonNameForLSID(String lsid, String[] languages) { if (lsid != null) { for (String language: languages) { try { - Query query = queryParser.get().parse( - ALANameIndexer.IndexField.LSID.toString() + ":\"" + lsid + "\" " + - " AND " + - ALANameIndexer.IndexField.LANGUAGE.toString() + ":\"" + language + "\" " - ); - TopDocs results = vernSearcher.search(query, 1); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(NameIndexField.LSID.search(lsid), BooleanClause.Occur.MUST); + builder.add(NameIndexField.LANGUAGE.search(language), BooleanClause.Occur.MUST); + TopDocs results = vernSearcher.search(builder.build(), 1); log.debug("Number of matches for " + lsid + " " + results.totalHits); for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - return doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + return doc.get(NameIndexField.COMMON_NAME.toString()); } } catch (Exception e) { log.debug("Unable to access document for common name.", e); @@ -1678,7 +1625,7 @@ public String getCommonNameForLSID(String lsid, String[] languages) { */ public Set getCommonNamesForLSID(String lsid, int maxNumberOfNames) { if (lsid != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); try { TopDocs results = vernSearcher.search(query, maxNumberOfNames); //if all the results have the same scientific name result the LSID for the first @@ -1689,7 +1636,7 @@ public Set getCommonNamesForLSID(String 
lsid, int maxNumberOfNames) { int idx = 0; for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - String name = doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + String name = doc.get(NameIndexField.COMMON_NAME.toString()); if(!lowerCaseResults.contains(name.toLowerCase())){ lowerCaseResults.add(name.toLowerCase()); names.add(name); @@ -1715,7 +1662,7 @@ public Set getCommonNamesForLSID(String lsid, int maxNumberOfNames) { */ private String getLSIDForUniqueCommonName(String name) { if (name != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.SEARCHABLE_COMMON_NAME.toString(), name.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""))); + Query query = NameIndexField.SEARCHABLE_COMMON_NAME.search(name); try { TopDocs results = vernSearcher.search(query, 10); //if all the results have the same scientific name result the LSID for the first @@ -1725,10 +1672,10 @@ private String getLSIDForUniqueCommonName(String name) { for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); if (firstLsid == null) { - firstLsid = doc.get(ALANameIndexer.IndexField.LSID.toString()); - firstName = doc.get(ALANameIndexer.IndexField.NAME.toString()); + firstLsid = doc.get(NameIndexField.LSID.toString()); + firstName = doc.get(NameIndexField.NAME.toString()); } else { - if (!doSciNamesMatch(firstName, doc.get(ALANameIndexer.IndexField.NAME.toString()))) + if (!doSciNamesMatch(firstName, doc.get(NameIndexField.NAME.toString()))) return null; } } @@ -1791,11 +1738,11 @@ public NameSearchResult searchForCommonName(String name) { */ public String getPrimaryLsid(String lsid) { if (lsid != null) { - TermQuery tq = new TermQuery(new Term("lsid", lsid)); + Query tq = NameIndexField.LSID.search(lsid); try { org.apache.lucene.search.TopDocs results = idSearcher.search(tq, 1); if (results.totalHits.value > 0) - return 
idSearcher.doc(results.scoreDocs[0].doc).get("reallsid"); + return idSearcher.doc(results.scoreDocs[0].doc).get(NameIndexField.REAL_LSID.toString()); } catch (IOException e) { } } @@ -1806,7 +1753,7 @@ public String getPrimaryLsid(String lsid) { public NameSearchResult searchForRecordByLsid(String lsid) { NameSearchResult result = null; try { - Query query = new TermQuery(new Term(NameIndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); TopDocs hits = this.idSearcher.search(query, 1); if (hits.totalHits.value == 0) hits = this.cbSearcher.search(query, 1); @@ -1905,16 +1852,16 @@ private void appendAutocompleteResults(Map output, TopDocs results, } } - private Query buildAutocompleteQuery(String field, String q, boolean allSearches) { + private Query buildAutocompleteQuery(NameIndexField field, String q, boolean allSearches) { //best match - Query fq1 = new BoostQuery(new TermQuery(new Term(field,q)), 12f); //exact match + Query fq1 = new BoostQuery(field.search(q), 12f); //exact match //partial matches - Query fq5 = new WildcardQuery(new Term(field,q + "*")); //begins with that begins with - Query fq6 = new WildcardQuery(new Term(field,"* " + q + "*")); //contains word that begins with + Query fq5 = field.searchWildcard(q + "*"); //begins with that begins with + Query fq6 = field.searchWildcard("* " + q + "*"); //contains word that begins with //any match - Query fq7 = new WildcardQuery(new Term(field,"*" + q + "*")); //any match + Query fq7 = field.searchWildcard("*" + q + "*"); //any match //join BooleanQuery o = new BooleanQuery.Builder() @@ -1927,8 +1874,8 @@ private Query buildAutocompleteQuery(String field, String q, boolean allSearches } private String getPreferredGuid(String taxonConceptGuid) throws Exception { - Query qGuid = new TermQuery(new Term("guid", taxonConceptGuid)); - Query qOtherGuid = new TermQuery(new Term("otherGuid", taxonConceptGuid)); + Query qGuid = NameIndexField.GUID.search(taxonConceptGuid); + Query 
qOtherGuid = NameIndexField.OTHER_GUID.search(taxonConceptGuid); BooleanQuery fullQuery = new BooleanQuery.Builder() .add(qGuid, BooleanClause.Occur.SHOULD) @@ -1937,7 +1884,7 @@ private String getPreferredGuid(String taxonConceptGuid) throws Exception { TopDocs topDocs = cbSearcher.search(fullQuery, 1); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document doc = cbSearcher.doc(scoreDoc.doc); - return doc.get("guid"); + return doc.get(NameIndexField.GUID.toString()); } return taxonConceptGuid; } @@ -2052,7 +1999,7 @@ private String findLSIDByConcatName(String name) { try { String concatName = concatName(name); - Query query = new TermQuery(new Term("concat_name", concatName)); + Query query = NameIndexField.CONCAT_NAME.search(concatName); TopDocs topDocs = cbSearcher.search(query, 2); if (topDocs != null && topDocs.totalHits.value == 1) { @@ -2106,10 +2053,10 @@ public List autocomplete(String q, int max, boolean includeSynonyms) { String uq = q.toUpperCase(); //name search - Query fq = buildAutocompleteQuery("name", lq, false); + Query fq = buildAutocompleteQuery(NameIndexField.NAME, lq, false); BooleanQuery b = new BooleanQuery.Builder() .add(fq, BooleanClause.Occur.MUST) - .add(new WildcardQuery(new Term("left", "*")), includeSynonyms ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST) + .add(NameIndexField.LEFT.searchWildcard("*"), includeSynonyms ? 
BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST) .build(); TopDocs results = cbSearcher.search(b, max); appendAutocompleteResults(output, results, includeSynonyms, false); @@ -2118,7 +2065,7 @@ public List autocomplete(String q, int max, boolean includeSynonyms) { uq = concatName(uq).toUpperCase(); //common name search - fq = buildAutocompleteQuery("common", uq, true); + fq = buildAutocompleteQuery(NameIndexField.SEARCHABLE_COMMON_NAME, uq, true); results = vernSearcher.search(fq, max); appendAutocompleteResults(output, results, includeSynonyms, true); @@ -2177,4 +2124,21 @@ public static void main(String[] args) throws IOException { } } + /** + * Values for fields + */ + private static class Value { + public NameIndexField field; + public T value; + + private Value(NameIndexField field, T value) { + this.field = field; + this.value = value; + } + + public static Value of(NameIndexField field, T value) { + return new Value<>(field, value); + } + } + } diff --git a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java b/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java index 71ccbd30a..a74bdb705 100644 --- a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java +++ b/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java @@ -213,10 +213,9 @@ public boolean create(File namesDwc) throws Exception{ } public void createIrmng(File irmngDwc) throws Exception { - if (irmngDwc == null || !irmngDwc.exists()) - return; - IndexWriter irmngWriter = this.createIndexWriter(new File(this.targetDir, "irmng"), this.analyzer, true); - this.indexIrmngDwcA(irmngWriter, irmngDwc.getCanonicalPath()); + IndexWriter irmngWriter = this.createIndexWriter(new File(this.targetDir, "irmng"), this.analyzer, true); + if (irmngDwc != null && irmngDwc.exists()) + this.indexIrmngDwcA(irmngWriter, irmngDwc.getCanonicalPath()); irmngWriter.commit(); irmngWriter.forceMerge(1); irmngWriter.close(); @@ -492,15 +491,18 @@ public boolean createLoadingIndex(File 
archiveDirectory) throws Exception{ RankType rt = RankType.getForStrRank(taxonRank); if(rt != null){ doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), rt.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), rt.getId())); } else { doc.add(new StringField(NameIndexField.RANK.toString(), taxonRank, Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); } } else { //put in unknown rank doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); } if(StringUtils.equals(taxonID, acceptedNameUsageID) || StringUtils.equals(id, acceptedNameUsageID) || acceptedNameUsageID == null){ //mark this one as an accepted concept @@ -582,7 +584,7 @@ public void generateIndex() throws Exception{ //get all the records that don't have parents that are accepted log.info("Loading index from temporary index."); TopDocs rootConcepts = getLoadIdxResults(null, "root", "T", PAGE_SIZE); - int left = 0; + int left = 1; int right = left; int lastRight = right; int count = 0; @@ -729,8 +731,8 @@ private int addIndex(Document doc, int currentDepth, int currentLeft, LinnaeanRa doc.get(NameIndexField.AUTHOR.toString()), doc.get(NameIndexField.RANK.toString()), doc.get(NameIndexField.RANK_ID.toString()), - 
Integer.toString(left), - Integer.toString(right), + left, + right, newcl, nameComplete, otherNames, @@ -752,7 +754,7 @@ protected Document createALASynonymDocument(String scientificName, String author String genus = null; String specificEpithet = null; String infraspecificEpithet = null; - try { + try { TopDocs hits = this.cbSearcher.search(new TermQuery(new Term(NameIndexField.LSID.toString(), acceptedLsid)), 1); if (hits.totalHits.value > 0) accepted = this.cbSearcher.doc(hits.scoreDocs[0].doc); @@ -786,7 +788,7 @@ protected Document createALASynonymDocument(String scientificName, String author } Document doc = createALAIndexDocument(scientificName, id, lsid, null, null, - kingdom, null, phylum, null, clazz, null, order, null, family, null, genus, null, null, null, null, null, + kingdom, null, phylum, null, clazz, null, order, null, family, null, genus, null, null, null, 0, 0, acceptedLsid, specificEpithet, infraspecificEpithet, author, nameComplete, otherNames, priority); if (doc != null && synonymType != null) { try { diff --git a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java index 0df2e9d9a..a005d1810 100644 --- a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java +++ b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java @@ -504,7 +504,7 @@ public void testSpMarker3() { try { String name = "Lindernia sp. 
Pilbara (M.N.Lyons & L.Lewis FV 1069)"; NameSearchResult nsr = null; - nsr = searcher.searchForRecord(name, RankType.SUBSPECIES); + nsr = searcher.searchForRecord(name, RankType.SPECIES); assertNotNull(nsr); assertEquals("https://id.biodiversity.org.au/name/apni/51306553", nsr.getLsid()); } catch (SearchResultException e) { @@ -630,8 +630,6 @@ public void testSynonymWithoutRank1() throws Exception { nsr = searcher.searchForRecord(cl, true, true); assertEquals("ALA_3267030", nsr.getLsid()); assertEquals("https://biodiversity.org.au/afd/taxa/5291343e-fdeb-4a65-8ba5-928f5b96acf5", nsr.getAcceptedLsid()); - nsr = searcher.searchForRecord("Cracticus tibicen", RankType.GENUS); - assertEquals(null, nsr); } @@ -1532,8 +1530,7 @@ public void testMetricsLookup1() throws Exception { assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); } - - @Test + @Ignore // Until sub-taxon synonymy decided public void testMetricsLookup2() throws Exception { String name = "Trigonaphera vinnulum"; // Synonym of Trigonostoma vinnulum LinnaeanRankClassification cl = new LinnaeanRankClassification(); @@ -1665,7 +1662,7 @@ public void testHigherTaxonMatch2() { cl.setScientificName(name); NameSearchResult nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/node/lichen/30088140", nsr.getLsid()); + assertEquals("NZOR-6-1843", nsr.getLsid()); assertEquals("Ramalina", nsr.getRankClassification().getGenus()); assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); } catch (SearchResultException e) { @@ -1674,7 +1671,7 @@ public void testHigherTaxonMatch2() { } @Test - public void testHomonymWithOrderResolution1() { + public void testHomonymWithOrderResolution1() throws Exception { try { String name = "Abelia"; LinnaeanRankClassification cl = new LinnaeanRankClassification(); @@ -1682,22 +1679,16 @@ public void testHomonymWithOrderResolution1() { NameSearchResult nsr = searcher.searchForRecord(cl, true); fail("Expecting homonym 
exception"); } catch (HomonymException ex) { - assertEquals(1, ex.getResults().size()); - } catch (SearchResultException e) { - fail("Unexpected search exception " + e); + assertEquals(2, ex.getResults().size()); } - try { - String name = "Abelia"; - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setScientificName(name); - cl.setOrder("Dipsacales"); - NameSearchResult nsr = searcher.searchForRecord(cl, true); - assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/node/apni/2892114", nsr.getLsid()); - } catch (SearchResultException e) { - fail("Unexpected search exception " + e); - } - } + String name = "Abelia"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + cl.setOrder("Dipsacales"); + NameSearchResult nsr = searcher.searchForRecord(cl, true); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/node/apni/2892114", nsr.getLsid()); + } @Test public void testMultipleMisappliedResolution1() throws Exception { diff --git a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java index ed7805dd6..9d1b670b7 100644 --- a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java +++ b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java @@ -73,20 +73,15 @@ public void synonymHomonymIssue(){ } @Test - public void testRecursiveAuthorshipIssue() { - try { - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setScientificName("Graphis notreallyaname Mull.Arg."); - cl.setAuthorship("Mull.Arg."); - cl.setKingdom("Animalia"); - cl.setGenus("Graphis"); - cl.setSpecificEpithet("notreallyaname"); - MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("https://biodiversity.org.au/afd/taxa/2af76a1e-2086-46e3-90b9-6f00983b21a5", metrics.getResult().getLsid()); // Graphis from AFD - } catch (Exception e) { - e.printStackTrace(); - fail("Exception should 
not occur"); - } + public void testRecursiveAuthorshipIssue1() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Graphis notreallyaname Mull.Arg."); + cl.setAuthorship("Mull.Arg."); + cl.setKingdom("Animalia"); + cl.setGenus("Graphis"); + cl.setSpecificEpithet("notreallyaname"); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertEquals("https://biodiversity.org.au/afd/taxa/2af76a1e-2086-46e3-90b9-6f00983b21a5", metrics.getResult().getLsid()); // Graphis from AFD } @Test @@ -98,7 +93,17 @@ public void testRecursiveAuthorshipIssue2() throws Exception { cl.setGenus("Graphis"); cl.setSpecificEpithet("notreallyaname"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("https://id.biodiversity.org.au/taxon/lichen/30241431", metrics.getResult().getLsid()); // Can't find Graphis since not APC placed so gets Graphidaceae + assertEquals("NZOR-6-132826", metrics.getResult().getLsid()); // Can't find Graphis homonym so gets Graphidaceae + } + + @Test + public void testRecursiveAuthorshipIssue3() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Graphis"); + cl.setKingdom("Fungi"); + cl.setGenus("Graphis"); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertEquals("NZOR-6-122770", metrics.getResult().getLsid()); // Can't find Graphis homonym so gets Graphidaceae } @Test @@ -147,7 +152,7 @@ public void testSPNovName() { cl.setSpecificEpithet(spEp); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); //System.out.println(metrics.getResult()); - assertEquals("http://id.biodiversity.org.au/instance/apni/884433", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/instance/apni/884433", metrics.getResult().getLsid()); assertTrue(metrics.getErrors().contains(ErrorType.HOMONYM)); } catch (Exception e) { From 
1126bf48245e7f7455b6bb14e8b051d859d13566 Mon Sep 17 00:00:00 2001 From: pal155 Date: Mon, 26 Jul 2021 10:06:14 +1000 Subject: [PATCH 06/19] Update configuration with new name mappings and exclusions --- data/ala-taxon-config.json | 16 +++++++++++++--- .../ala/names/index/TaxonConceptInstance.java | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/data/ala-taxon-config.json b/data/ala-taxon-config.json index b719d75f3..a21af6956 100644 --- a/data/ala-taxon-config.json +++ b/data/ala-taxon-config.json @@ -34,7 +34,7 @@ }, { "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", - "matchType": "REGEX", + "matchType": "INSENSITIVE", "scientificName": "Not assigned" }, { @@ -634,10 +634,12 @@ "Cerapus murrayae": "Cerapus murrayi", "Chelonaplysilla noevus": "Chelonaplysilla naevus", "Chromonephthea muironensis": "Chromonephthea murionensis", + "Cis munitus": "Cis minutus", "Compsopogon coeruleus": "Compsopogon caeruleus", "Cortinarius campbellae": "Cortinarius campbelliae", "Diastylopsis thileniusi": "Diastylopsis thilenuisi", "Difflugia garmen": "Difflugia gramen", + "Encyonema auerwaldsii": "Encyonema auerswaldii", "Euglypha loevis": "Euglypha laevis", "Eumida hawkseburyensis": "Eumida hawkesburyensis", "Euryspongia deliculata": "Euryspongia delicatula", @@ -654,26 +656,35 @@ "Liljeborgia aequiabilis": "Liljeborgia aequabilis", "Limnodriloides winckelmanni": "Limnodriloides wincklemanni", "Liocranchia valdiviae": "Liocranchia valdivae", + "Lyngbya digueti": "Lyngbya diguetii", + "Macromitrium ligulifolium": "Macromitrium ligulaefolium", "Marasmius crinis-equi": "Marasmius crinisequi", "Mesoplodon gingkodens": "Mesoplodon ginkgodens", "Metacirolana basteni": "Metacirolana bastenae", "Mycedium mancoi": "Mycedium mancaoi", "Mysticoncha wilsonae": "Mysticoncha wilsoni", + "Navicula fromenterae": "Navicula formenterae", + "Navicula laterostriata": "Navicula laterostrata", "Nectria quisquiliaris": "Nectria quisquilaris", "Neelaps calonotos": 
"Neelaps calonotus", "Odontosyllis langerhansaesetosa": "Odontosyllis langerhansiaesetosa", "Paraminabea aldersaldei": "Paraminabea aldersladei", "Phyllodoce madierensis": "Phyllodoce madeirensis", + "Phytophthora fragariifolia": "Phytophthora fragariaefolia", "Plumatella \"longigemmis\"": "Plumatella longigemmis", "Prionospio auckalndica": "Prionospio aucklandica", + "Porphyrosiphon notarisii": "Porphyrosiphon notarissi", "Puccinia duthiae": "Puccinia duthiei", "Puccinia argophyllae": "Puccinia argophylli", "Reteporella lacinata": "Reteporella laciniata", "Reteporella malleatia": "Reteporella malleata", "Ringicula doliaris": "Ringicula dolaris", + "Scytonema hofmanni": "Scytonema hofmannii", + "Scytonema viarium": "Scytonema varium", "Smittoidea discoverae": "Smittoidea discoveriae", "Sporisorium australiasiaticum": "Sporisorium australasiaticum", "Stereum amoenum": "Stereum amaenum", + "Sticta wiegelii": "Sticta weigelii", "Stylopoma thornelyae": "Stylopoma thornelyi", "Tectacingulum tumidum": "Tectacingulum tumidium", "Tesarius sulcipennis": "Tesarius suclipennis", @@ -741,8 +752,7 @@ "Plantae": 6000 }, "owner": [ - "Plantae", - "Solanum torvum" + "Plantae" ] }, { diff --git a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java b/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java index df9c44ebd..74c5a1266 100644 --- a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java +++ b/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java @@ -276,7 +276,7 @@ public NameProvider getProvider() { } /** - * Get the originating authorityfor the data + * Get the originating authority for the data * * @return The authority source */ From e025340ecf53e73882f6ff71b7e3584073698138 Mon Sep 17 00:00:00 2001 From: pal155 Date: Mon, 26 Jul 2021 12:16:59 +1000 Subject: [PATCH 07/19] Fix travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 56e72af80..6e6d70b5e 100644 --- 
a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ branches: before_install: - mkdir -p ~/.m2; wget -q -O ~/.m2/settings.xml https://raw.githubusercontent.com/AtlasOfLivingAustralia/travis-build-configuration/master/travis_maven_settings_simple.xml -- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210621.tgz https://archives.ala.org.au/archives/nameindexes/20200214/namematching-20200214-lucene8.tgz +- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210621.tgz https://archives.ala.org.au/archives/nameindexes/20210629/namematching-20210629.tgz - cd /data/lucene - sudo tar zxvf namematching-20210629.tgz - sudo ln -s namematching-20210629 namematching From 6fd4936c08733689d94d5df844036c44639a45fb Mon Sep 17 00:00:00 2001 From: pal155 Date: Mon, 26 Jul 2021 12:39:20 +1000 Subject: [PATCH 08/19] Sigh --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6e6d70b5e..db985dbf4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ branches: before_install: - mkdir -p ~/.m2; wget -q -O ~/.m2/settings.xml https://raw.githubusercontent.com/AtlasOfLivingAustralia/travis-build-configuration/master/travis_maven_settings_simple.xml -- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210621.tgz https://archives.ala.org.au/archives/nameindexes/20210629/namematching-20210629.tgz +- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210629.tgz https://archives.ala.org.au/archives/nameindexes/20210629/namematching-20210629.tgz - cd /data/lucene - sudo tar zxvf namematching-20210629.tgz - sudo ln -s namematching-20210629 namematching From 6d1a560493bb49253f815d74665ab79a8a8da690 Mon Sep 17 00:00:00 2001 From: Doug Palmer Date: Mon, 26 Jul 2021 21:51:26 +1000 Subject: [PATCH 09/19] Update for the 20210629 name index (#124) * Update to the new name matching data. * New index 20210629 * Forbid "Unknown" and "XXX sp." 
names * Forbid illegitimate-type names from APNI-only source * Get dataset name if available * Build taxonomy from a recursive list of directories * Don't allow forbidden instances when looking for accepted elements (default to scientific name in the expectation that this will be resolved) * Use lucene API for searches * Update configuration with new name mappings and exclusions --- .travis.yml | 6 +- README.md | 4 +- data/ala-taxon-config.json | 101 ++++- .../org/ala/names/index/DwcaNameSource.java | 11 +- .../au/org/ala/names/index/NameSource.java | 13 +- .../org/ala/names/index/ScientificName.java | 21 +- .../au/org/ala/names/index/TaxonConcept.java | 2 +- .../ala/names/index/TaxonConceptInstance.java | 27 +- .../java/au/org/ala/names/index/Taxonomy.java | 4 +- .../org/ala/names/index/TaxonomyBuilder.java | 52 ++- .../index/provider/MatchTaxonCondition.java | 2 +- .../au/org/ala/names/model/FieldType.java | 245 +++++++++++ .../model/LinnaeanRankClassification.java | 27 +- .../org/ala/names/model/NameIndexField.java | 133 ++++-- .../org/ala/names/search/ALANameIndexer.java | 216 ++++------ .../org/ala/names/search/ALANameSearcher.java | 356 +++++++--------- .../org/ala/names/search/DwcaNameIndexer.java | 26 +- src/main/resources/taxonomy.properties | 3 + .../provider/MatchTaxonConditionTest.java | 17 + .../ala/names/search/ALANameSearcherTest.java | 403 +++++++++++++----- .../ala/names/search/BiocacheMatchTest.java | 49 ++- .../ala/names/search/IconicSpeciesTest.java | 2 +- .../ala/names/search/VernacularMatchTest.java | 8 +- .../ala/names/search/iconic_species_list.csv | 10 +- 24 files changed, 1183 insertions(+), 555 deletions(-) create mode 100644 src/main/java/au/org/ala/names/model/FieldType.java diff --git a/.travis.yml b/.travis.yml index 57ba34a7f..db985dbf4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,10 +10,10 @@ branches: before_install: - mkdir -p ~/.m2; wget -q -O ~/.m2/settings.xml 
https://raw.githubusercontent.com/AtlasOfLivingAustralia/travis-build-configuration/master/travis_maven_settings_simple.xml -- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20200214-lucene8.tgz https://archives.ala.org.au/archives/nameindexes/20200214/namematching-20200214-lucene8.tgz +- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210629.tgz https://archives.ala.org.au/archives/nameindexes/20210629/namematching-20210629.tgz - cd /data/lucene -- sudo tar zxvf namematching-20200214-lucene8.tgz -- sudo ln -s namematching-20200214-lucene8 namematching +- sudo tar zxvf namematching-20210629.tgz +- sudo ln -s namematching-20210629 namematching - ls -laF - cd $TRAVIS_BUILD_DIR diff --git a/README.md b/README.md index 8e0b9cfba..6d0dd4438 100644 --- a/README.md +++ b/README.md @@ -82,8 +82,8 @@ The build creates 3 artefacts in the ala-name-matching/target directory: * ala-name-matching-3.5-distribution.zip - zip containing the project jar and dependencies * ala-name-matching-3.5-sources.jar - source jar for the project code only -The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20200214) and needs to be extracted to the -directory `/data/lucene/namematching-20200214` +The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20220629) and needs to be extracted to the +directory `/data/lucene/namematching-20210629` ## ALA Names List diff --git a/data/ala-taxon-config.json b/data/ala-taxon-config.json index f04775e7e..a21af6956 100644 --- a/data/ala-taxon-config.json +++ b/data/ala-taxon-config.json @@ -26,6 +26,21 @@ { "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", "taxonomicStatus": "INFERRED_INVALID" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "REGEX", + "scientificName": "Unknown( .*|)" + }, 
+ { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "INSENSITIVE", + "scientificName": "Not assigned" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "REGEX", + "scientificName": "[A-Z][A-Za-z]+ sp\\.?" } ], "adjustments": [ @@ -183,13 +198,6 @@ }, "adjustment": -20 }, - { - "condition": { - "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", - "nomenclaturalStatus": "FORGOTTEN" - }, - "adjustment": -20 - }, { "condition": { "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", @@ -626,10 +634,12 @@ "Cerapus murrayae": "Cerapus murrayi", "Chelonaplysilla noevus": "Chelonaplysilla naevus", "Chromonephthea muironensis": "Chromonephthea murionensis", + "Cis munitus": "Cis minutus", "Compsopogon coeruleus": "Compsopogon caeruleus", "Cortinarius campbellae": "Cortinarius campbelliae", "Diastylopsis thileniusi": "Diastylopsis thilenuisi", "Difflugia garmen": "Difflugia gramen", + "Encyonema auerwaldsii": "Encyonema auerswaldii", "Euglypha loevis": "Euglypha laevis", "Eumida hawkseburyensis": "Eumida hawkesburyensis", "Euryspongia deliculata": "Euryspongia delicatula", @@ -646,26 +656,35 @@ "Liljeborgia aequiabilis": "Liljeborgia aequabilis", "Limnodriloides winckelmanni": "Limnodriloides wincklemanni", "Liocranchia valdiviae": "Liocranchia valdivae", + "Lyngbya digueti": "Lyngbya diguetii", + "Macromitrium ligulifolium": "Macromitrium ligulaefolium", "Marasmius crinis-equi": "Marasmius crinisequi", "Mesoplodon gingkodens": "Mesoplodon ginkgodens", "Metacirolana basteni": "Metacirolana bastenae", "Mycedium mancoi": "Mycedium mancaoi", "Mysticoncha wilsonae": "Mysticoncha wilsoni", + "Navicula fromenterae": "Navicula formenterae", + "Navicula laterostriata": "Navicula laterostrata", "Nectria quisquiliaris": "Nectria quisquilaris", "Neelaps calonotos": "Neelaps calonotus", "Odontosyllis langerhansaesetosa": "Odontosyllis langerhansiaesetosa", "Paraminabea aldersaldei": 
"Paraminabea aldersladei", "Phyllodoce madierensis": "Phyllodoce madeirensis", + "Phytophthora fragariifolia": "Phytophthora fragariaefolia", "Plumatella \"longigemmis\"": "Plumatella longigemmis", "Prionospio auckalndica": "Prionospio aucklandica", + "Porphyrosiphon notarisii": "Porphyrosiphon notarissi", "Puccinia duthiae": "Puccinia duthiei", "Puccinia argophyllae": "Puccinia argophylli", "Reteporella lacinata": "Reteporella laciniata", "Reteporella malleatia": "Reteporella malleata", "Ringicula doliaris": "Ringicula dolaris", + "Scytonema hofmanni": "Scytonema hofmannii", + "Scytonema viarium": "Scytonema varium", "Smittoidea discoverae": "Smittoidea discoveriae", "Sporisorium australiasiaticum": "Sporisorium australasiaticum", "Stereum amoenum": "Stereum amaenum", + "Sticta wiegelii": "Sticta weigelii", "Stylopoma thornelyae": "Stylopoma thornelyi", "Tectacingulum tumidum": "Tectacingulum tumidium", "Tesarius sulcipennis": "Tesarius suclipennis", @@ -733,8 +752,7 @@ "Plantae": 6000 }, "owner": [ - "Plantae", - "Solanum torvum" + "Plantae" ] }, { @@ -743,7 +761,59 @@ "description": "Australian Plant Name Index entries not placed by the Australian Plant Census, given an assumed parent of Plantae", "parent": "apni-apc", "authority": false, - "defaultScore" : 4000 + "defaultScore" : 4000, + "adjuster": { + "forbidden": [ + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "FORGOTTEN" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "CONFUSED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "ABORTED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "SUPERFLUOUS" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "NUDUM" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": 
"NULL_NAME" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "SUPPRESSED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "REJECTED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "REJECTED_OUTRIGHT" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "ILLEGITIMATE" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "INVALID" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "DENIED" + } + ] + } }, { "id" : "dr2699", @@ -792,6 +862,17 @@ ] } }, + { + "id" : "dr17664", + "name": "ABRSL", + "description": "ABRS Lichen Checklist", + "parent": "apni-apc", + "rightsHolder": "Commonwealth Scientific and Industrial Research Organisation", + "authority": false, + "defaultScore" : 2500, + "defaultParentTaxon": "Plantae", + "conceptResolutionPriority": "ADDITIONAL" + }, { "id" : "dr2704", "name": "CAAB", diff --git a/src/main/java/au/org/ala/names/index/DwcaNameSource.java b/src/main/java/au/org/ala/names/index/DwcaNameSource.java index d4473cca0..00e004961 100644 --- a/src/main/java/au/org/ala/names/index/DwcaNameSource.java +++ b/src/main/java/au/org/ala/names/index/DwcaNameSource.java @@ -227,6 +227,12 @@ public void loadVernacularDwCA(Taxonomy taxonomy) throws IndexBuilderException { * @throws IndexBuilderException if unable to load a record into the taxonomy. 
*/ protected void loadTaxonDwCA(Taxonomy taxonomy) throws IndexBuilderException { + String defaultDatasetName = null; + try { + defaultDatasetName = archive.getMetadata().getTitle(); + } catch (MetadataException e) { + taxonomy.report(IssueType.PROBLEM, "provider.archive.noMetadata", (String) null, null); + } if (archive.getCore().getRowType() != DwcTerm.Taxon) throw new IndexBuilderException("Expecting a core row type of " + DwcTerm.Taxon); List classifiers = TaxonConceptInstance.CLASSIFICATION_FIELDS.stream().filter(t -> archive.getCore().hasTerm(t)).collect(Collectors.toList()); @@ -240,7 +246,10 @@ protected void loadTaxonDwCA(Taxonomy taxonomy) throws IndexBuilderException { Record core = record.core(); taxonID = core.value(DwcTerm.taxonID); String verbatimNomenclaturalCode = core.value(DwcTerm.nomenclaturalCode); - NameProvider provider = taxonomy.resolveProvider(core.value(DwcTerm.datasetID), core.value(DwcTerm.datasetName)); + String datasetName = core.value(DwcTerm.datasetName); + if (datasetName == null) + datasetName = defaultDatasetName; + NameProvider provider = taxonomy.resolveProvider(core.value(DwcTerm.datasetID), datasetName); NomenclaturalCode code = taxonomy.resolveCode(verbatimNomenclaturalCode); String scientificName = core.value(DwcTerm.scientificName); String scientificNameAuthorship = core.value(DwcTerm.scientificNameAuthorship); diff --git a/src/main/java/au/org/ala/names/index/NameSource.java b/src/main/java/au/org/ala/names/index/NameSource.java index 4b75daad0..926c766d7 100644 --- a/src/main/java/au/org/ala/names/index/NameSource.java +++ b/src/main/java/au/org/ala/names/index/NameSource.java @@ -345,16 +345,15 @@ abstract public class NameSource { * * @throws IndexBuilderException if unable to create the name source */ - public static NameSource create(String f) throws IndexBuilderException { + public static NameSource create(File f) throws IndexBuilderException { try { - File nf = new File(f); NameSource ns; - if (!nf.exists()) - 
throw new IndexBuilderException("Name source " + nf + " does not exist"); - if (nf.isDirectory()) - ns = new DwcaNameSource(nf); + if (!f.exists()) + throw new IndexBuilderException("Name source " + f + " does not exist"); + if (f.isDirectory()) + ns = new DwcaNameSource(f); else - ns = new CSVNameSource(nf.toPath(), "UTF-8", DwcTerm.Taxon); + ns = new CSVNameSource(f.toPath(), "UTF-8", DwcTerm.Taxon); ns.validate(); return ns; } catch (IOException ex) { diff --git a/src/main/java/au/org/ala/names/index/ScientificName.java b/src/main/java/au/org/ala/names/index/ScientificName.java index 74d8ce9a6..3a8ca1161 100644 --- a/src/main/java/au/org/ala/names/index/ScientificName.java +++ b/src/main/java/au/org/ala/names/index/ScientificName.java @@ -2,6 +2,8 @@ import au.org.ala.names.model.RankType; import au.org.ala.names.util.DwcaWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.*; @@ -30,6 +32,8 @@ * @copyright Copyright (c) 2017 CSIRO */ public class ScientificName extends Name implements Comparable { + private static final Logger logger = LoggerFactory.getLogger(ScientificName.class); + /** * Construct for a container and a key * @@ -100,13 +104,18 @@ public TaxonomicElement findElement(Taxonomy taxonomy, NameProvider provider) { */ @Override protected TaxonConcept findPrincipal(Taxonomy taxonomy) { - TaxonConcept principal = this.findBasePrincipal(taxonomy); - TaxonConceptInstance representative = principal.getRepresentative(); - TaxonConceptInstance resolved = representative.getResolvedAccepted(); + try { + TaxonConcept principal = this.findBasePrincipal(taxonomy); + TaxonConceptInstance representative = principal.getRepresentative(); + TaxonConceptInstance resolved = representative.getResolvedAccepted(); - if (resolved != representative && resolved.getContainer().getContainer() == this) - principal = resolved.getContainer(); - return principal; + if (resolved != representative && 
resolved.getContainer().getContainer() == this) + principal = resolved.getContainer(); + return principal; + } catch (RuntimeException ex) { + logger.error("Unable to find principal for " + this); + throw ex; + } } /** diff --git a/src/main/java/au/org/ala/names/index/TaxonConcept.java b/src/main/java/au/org/ala/names/index/TaxonConcept.java index 832b3eeb2..e3f512fdf 100644 --- a/src/main/java/au/org/ala/names/index/TaxonConcept.java +++ b/src/main/java/au/org/ala/names/index/TaxonConcept.java @@ -118,7 +118,7 @@ public TaxonConceptInstance addInstance(NameKey instanceKey, TaxonConceptInstanc */ public TaxonConceptInstance findInstance(NameProvider provider, boolean acceptedOnly) { for (TaxonConceptInstance instance: this.instances) - if (instance.getProvider().equals(provider) && (!acceptedOnly || instance.isAccepted())) + if (instance.getProvider().equals(provider) && !instance.isForbidden() && (!acceptedOnly || instance.isAccepted())) return instance; return null; } diff --git a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java b/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java index 0bb5b539b..74c5a1266 100644 --- a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java +++ b/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java @@ -12,6 +12,8 @@ import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.IOException; @@ -29,6 +31,8 @@ * @copyright Copyright © 2017 Atlas of Living Australia */ public class TaxonConceptInstance extends TaxonomicElement { + private static final Logger logger = LoggerFactory.getLogger(TaxonConceptInstance.class); + /** Compare instance base (priovider only) scores */ public static Comparator PROVIDER_SCORE_COMPARATOR = new Comparator() { @Override @@ -272,7 +276,7 @@ public NameProvider getProvider() { } /** - * Get the originating 
authorityfor the data + * Get the originating authority for the data * * @return The authority source */ @@ -935,6 +939,10 @@ private TaxonConceptInstance getResolvedAccepted(TaxonConceptInstance original, if (trace != null) trace.add(ae); TaxonConceptInstance accepted = ae.getRepresentative(); + if (accepted == null) { + logger.warn("Null representative instance for " + ae + " when resolving " + this); + return resolved; + } accepted = accepted.getResolvedAccepted(original, steps - 1, trace, exception); if (!accepted.isForbidden()) return accepted; @@ -993,27 +1001,33 @@ public void normalise() throws IndexBuilderException { * * @param taxonomy The current taxonomy * + * @return True if successfully resolved + * * @throws IndexBuilderException If unable to make a link, usually due to a broken reference */ // If you plan to change this, it is called by a parallel stream, so consisder thread safety // At the moment, this fills out inferred information only - public void resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { + public boolean resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { if (this.parentNameUsageID != null) { this.parent = taxonomy.getInstance(this.parentNameUsageID); } if (this.parentNameUsage != null && this.parent == null) { this.parent = taxonomy.findElement(this.code, this.parentNameUsage, this.provider, null); } - if (this.parent == null && (this.parentNameUsage != null || this.parentNameUsageID != null)) - throw new IndexBuilderException("Unable to find parent taxon for " + this + " from " + this.parentNameUsageID + " - " + this.parentNameUsage); + if (this.parent == null && (this.parentNameUsage != null || this.parentNameUsageID != null)) { + taxonomy.report(IssueType.ERROR, "instance.parent.invalidLink", this.taxonID, this.scientificName, "Unable to find parent taxon for " + this + " from " + this.parentNameUsageID + " - " + this.parentNameUsage); + return false; + } if (this.acceptedNameUsageID != null) { 
this.accepted = taxonomy.getInstance(this.acceptedNameUsageID); } if (this.acceptedNameUsage != null && this.accepted == null) { this.accepted = taxonomy.findElement(this.code, this.acceptedNameUsage, this.provider, null); } - if (this.accepted == null && (this.acceptedNameUsage != null || this.acceptedNameUsageID != null)) - throw new IndexBuilderException("Unable to find accepted taxon for " + this + " from " + this.acceptedNameUsageID + " - " + this.acceptedNameUsage); + if (this.accepted == null && (this.acceptedNameUsage != null || this.acceptedNameUsageID != null)) { + taxonomy.report(IssueType.ERROR, "instance.accepted.invalidLink", this.taxonID, this.scientificName, "Unable to find accepted taxon for " + this + " from " + this.acceptedNameUsageID + " - " + this.acceptedNameUsage); + return false; + } // No parent or accepted taxon but has a classification, so see if we can deduce a parent if (this.parent == null && this.accepted == null && this.classification != null) { String genus = ""; @@ -1045,6 +1059,7 @@ public void resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { if (this.parent == null) this.parent = this.provider.findDefaultParent(taxonomy, this); taxonomy.count("count.resolve.instance.links"); + return true; } /** diff --git a/src/main/java/au/org/ala/names/index/Taxonomy.java b/src/main/java/au/org/ala/names/index/Taxonomy.java index 881da9b75..b5e7eeeb0 100644 --- a/src/main/java/au/org/ala/names/index/Taxonomy.java +++ b/src/main/java/au/org/ala/names/index/Taxonomy.java @@ -478,7 +478,8 @@ public void provideUnknownTaxon() throws Exception { */ public void resolveLinks() throws IndexBuilderException { logger.info("Resolving links"); - this.instances.values().parallelStream().forEach(instance -> instance.resolveLinks(this)); + if (!this.instances.values().parallelStream().allMatch(instance -> instance.resolveLinks(this))) + throw new IndexBuilderException("Errors resolving links"); logger.info("Finished resolving links"); } @@ 
-1469,6 +1470,7 @@ public void createWorkingIndex() throws IOException { indexer.commitLoadingIndexes(); indexer.generateIndex(); indexer.create(interim); + indexer.createIrmng(null); indexer.commit(); } catch (Exception ex) { throw new IndexBuilderException("Unable to build working index"); diff --git a/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java b/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java index 0cf6380bc..d9784b94d 100644 --- a/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java +++ b/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java @@ -8,6 +8,7 @@ import org.slf4j.LoggerFactory; import java.io.*; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -22,6 +23,48 @@ public class TaxonomyBuilder { private static Logger logger = LoggerFactory.getLogger(TaxonomyBuilder.class); + /** + * Recursively find sources. + *

+ * The directory and sub-directory are first searched for a meta.xml file and, + * if present, the source is added as a DwCA. + * Otherwise, any csv files are added to the list and subdirectories recursively + * searched. + *

+ * @param path + * @return + */ + protected static List findSources(File path) { + List sources = new ArrayList<>(); + try { + if (!path.exists()) { + logger.info("Path does not exist " + path); + return sources; + } + if (path.isFile()) { + logger.info("Adding source file at " + path); + sources.add(NameSource.create(path)); + return sources; + } + if (!path.isDirectory()) { + logger.info("Unknown file type for " + path); + } + File meta = new File(path, "meta.xml"); + if (meta.exists()) { + logger.info("Adding DwCA at " + path); + sources.add(NameSource.create(path)); + return sources; + } else { + for (File f : path.listFiles()) { + if (f.isDirectory() || f.getName().endsWith(".csv")) + sources.addAll(findSources(f)); + } + } + } catch (Exception ex) { + logger.error("Unable to get sources for " + path, ex); + } + return sources; + } public static void main(String[] args) { try { @@ -36,12 +79,14 @@ public static void main(String[] args) { Integer samples = null; DwcaNameIndexer indexer; TaxonomyConfiguration config = null; + List sources; Option o = OptionBuilder.withLongOpt("output").withDescription("Output directory - defaults to 'combined' in the current directory").hasArg().withArgName("DIR").withType(File.class).create('o'); Option w = OptionBuilder.withLongOpt("work").withDescription("Working directory - defaults to the current directory").hasArg().withArgName("DIR").withType(File.class).create('w'); Option c = OptionBuilder.withLongOpt("config").withDescription("Configuration file").hasArg().withArgName("FILE").withType(File.class).create('c'); Option r = OptionBuilder.withLongOpt("report").withDescription("Report file").hasArg().withArgName("FILE").withType(File.class).create('r'); Option p = OptionBuilder.withLongOpt("previous").withDescription("Previous taxonomy DwCA").hasArg().withArgName("DIR").withType(File.class).create('p'); + Option recurse = OptionBuilder.withLongOpt("recurse").withDescription("Input file is a directory, recurse through 
subdirectories").create('R'); Option ncl = OptionBuilder.withLongOpt("noclean").withDescription("Don't clean up work area").create(); Option nc = OptionBuilder.withLongOpt("nocreate").withDescription("Don't create an output taxonomy").create(); Option s = OptionBuilder.withLongOpt("sample").withDescription("Output a sample taxonomy, consisting of n concepts plus their parents/accepted").hasArg().withArgName("N").withType(Integer.class).create(); @@ -50,6 +95,7 @@ public static void main(String[] args) { options.addOption(c); options.addOption(r); options.addOption(p); + options.addOption(recurse); options.addOption(ncl); options.addOption(nc); options.addOption(s); @@ -80,7 +126,11 @@ public static void main(String[] args) { if (cmd.hasOption("sample")) { samples = Integer.parseInt(cmd.getOptionValue("sample")); } - List sources = Arrays.asList(cmd.getArgs()).stream().map(f -> NameSource.create(f)).collect(Collectors.toList()); + if (cmd.hasOption("recurse")) { + sources = Arrays.asList(cmd.getArgs()).stream().map(File::new).map(f -> findSources(f)).flatMap(List::stream).collect(Collectors.toList()); + } else { + sources = Arrays.asList(cmd.getArgs()).stream().map(File::new).map(f -> NameSource.create(f)).collect(Collectors.toList()); + } Taxonomy taxonomy = new Taxonomy(config, work); taxonomy.begin(); taxonomy.load(sources); diff --git a/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java b/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java index c9992a01d..52047e6dd 100644 --- a/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java +++ b/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java @@ -211,7 +211,7 @@ private boolean matchScientificName(String name) { return this.matchScientificName.equals(name); case REGEX: if (this.patternScientificName == null) - this.patternScientificName = Pattern.compile(this.scientificName); + this.patternScientificName = Pattern.compile(this.scientificName, 
Pattern.CASE_INSENSITIVE); return this.patternScientificName.matcher(name).matches(); default: if (this.matchScientificName == null) diff --git a/src/main/java/au/org/ala/names/model/FieldType.java b/src/main/java/au/org/ala/names/model/FieldType.java new file mode 100644 index 000000000..16231e761 --- /dev/null +++ b/src/main/java/au/org/ala/names/model/FieldType.java @@ -0,0 +1,245 @@ +package au.org.ala.names.model; + +import au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.*; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.QueryBuilder; + +import java.util.function.BiConsumer; +import java.util.function.BiFunction; + +/** + * The type of field stored in the lucene index. + *

+ * Used to determine how to store and search for a field. + *

+ */ +abstract public class FieldType { + protected static final ThreadLocal ANALYZER = ThreadLocal.withInitial( + () -> LowerCaseKeywordAnalyzer.newInstance() + ); + protected static final ThreadLocal QUERY_BUILDER = ThreadLocal.withInitial( + () -> new QueryBuilder(ANALYZER.get()) + ); + protected static final ThreadLocal TERM_FIELD_TYPE = ThreadLocal.withInitial( + () -> { + org.apache.lucene.document.FieldType ft = new org.apache.lucene.document.FieldType(TextField.TYPE_STORED); + ft.setOmitNorms(true); + return ft; + } + ); + + /** The class of term stored */ + private Class class_; + /** The name of the field type */ + private String name; + + /** + * Construct with a name + * + * @param name The name + */ + public FieldType(Class class_, String name) { + this.class_ = class_; + this.name = name; + } + + /** + * Store a field into a lucene document. + *

+ * This may involve storing multiple lucene fields for range types. + *

+ * + * @param value The value to store + * @param name The name of the field + * @param document The document to add the field to + */ + abstract public void store(T value, String name, Document document); + + /** + * Generate a query for a field of this type. + * + * @param value The value to search for + * @param name The field name + * @return A query that searches for the value + */ + abstract public Query search(T value, String name); + + /** + * Search for a value in a range (inclusive). + *

+ * By default, this throws a {@link UnsupportedOperationException}. + * Types that have a concept of range can use this to implement a range search. + *

+ * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * + * @return A query based on the range + */ + public Query searchRange(T lower, T upper, String name) { + throw new UnsupportedOperationException("Field type " + this.name + " does not support ranges"); + } + + + /** + * Store-only field. + */ + public static final FieldType STORE = new FieldType(String.class,"store") { + @Override + public void store(String value, String name, Document document) { + document.add(new StoredField(name, value)); + } + + @Override + public Query search(String value, String name) { + throw new UnsupportedOperationException("Store-only field"); + } + }; + + /** + * An exact identifier. + *

+ * Storage and search is accomplished via exact lookup. + *

+ */ + public static final FieldType IDENTIFIER = new FieldType(String.class,"identifier") { + @Override + public void store(String value, String name, Document document) { + document.add(new StringField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + return new TermQuery(new Term(name, value)); + } + }; + + /** + * A simple term. + *

+ * Storage and search is accomplished via case-insensitive storage and lookup. + *

+ */ + public static final FieldType TERM = new FieldType(String.class, "term") { + @Override + public void store(String value, String name, Document document) { + Field field = new Field(name, value, TERM_FIELD_TYPE.get()); + document.add(field); + } + + @Override + public Query search(String value, String name) { + return QUERY_BUILDER.get().createPhraseQuery(name, value); + } + }; + + /** + * A tokenisable term. + *

+ * Storage and search is accomplished via case-insensitive tokenisation and search + *

+ */ + public static final FieldType TEXT = new FieldType(String.class, "text") { + + @Override + public void store(String value, String name, Document document) { + document.add(new TextField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + return QUERY_BUILDER.get().createPhraseQuery(name, value); + } + }; + + /** + * A common name. + *

+ * Storage and search is based on a simplified lookup where non alpha-numeric characters are removed + * and made case insensitive. + *

+ */ + public static final FieldType COMMON = new FieldType(String.class,"common") { + @Override + public void store(String value, String name, Document document) { + value = value.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""); + document.add(new StringField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + value = value.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""); + return new TermQuery(new Term(name, value)); + } + }; + + /** + * An integer term. + *

+ * Storage and search allow range-based queries. + *

+ */ + public static final FieldType INTEGER = new FieldType(Integer.class, "integer") { + @Override + public void store(Integer value, String name, Document document) { + document.add(new IntPoint(name, value)); + document.add(new StoredField(name, value)); + } + + @Override + public Query search(Integer value, String name) { + return IntPoint.newExactQuery(name, value); + } + + /** + * Search for a value in a range (inclusive). + * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * @return A query based on the range + */ + @Override + public Query searchRange(Integer lower, Integer upper, String name) { + return IntPoint.newRangeQuery(name, lower, upper); + } + }; + + /** + * A double term. + *

+ * Storage and search allow range-based queries. + *

+ */ + public static final FieldType DOUBLE = new FieldType(Double.class, "double") { + @Override + public void store(Double value, String name, Document document) { + document.add(new DoublePoint(name, value)); + document.add(new StoredField(name, value)); + } + + @Override + public Query search(Double value, String name) { + return DoublePoint.newExactQuery(name, value); + } + + /** + * Search for a value in a range (inclusive). + * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * @return A query based on the range + */ + @Override + public Query searchRange(Double lower, Double upper, String name) { + return DoublePoint.newRangeQuery(name, lower, upper); + } + }; + +} diff --git a/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java b/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java index 244b4bf49..d21949828 100644 --- a/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java +++ b/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java @@ -5,6 +5,8 @@ import org.apache.commons.lang.builder.HashCodeBuilder; import org.apache.commons.lang.builder.ToStringBuilder; import org.apache.commons.lang3.StringUtils; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; /** * A model object that represents a Linnaean Classification. @@ -465,27 +467,26 @@ public boolean hasIdenticalClassification(LinnaeanRankClassification lrc, RankTy * @param optional Indicates whether the the terms should be optional * @return */ - public String getLuceneSearchString(boolean optional) { - String prefix = optional ? " " : " +"; + public void appendLuceneQuery(BooleanQuery.Builder builder, boolean optional) { + BooleanClause.Occur occurs = optional ? 
BooleanClause.Occur.SHOULD : BooleanClause.Occur.FILTER; StringBuilder sb = new StringBuilder(); if (StringUtils.isNotEmpty(kingdom)) - sb.append(prefix).append(RankType.KINGDOM.getRank()).append(":\"").append(kingdom).append("\""); - if (StringUtils.isNotEmpty(phylum)) - sb.append(prefix).append(RankType.PHYLUM.getRank()).append(":\"").append(phylum).append("\""); + builder.add(NameIndexField.KINGDOM.search(this.kingdom), occurs); + if (StringUtils.isNotEmpty(phylum)) + builder.add(NameIndexField.PHYLUM.search(this.phylum), occurs); if (StringUtils.isNotEmpty(klass)) - sb.append(prefix).append(RankType.CLASS.getRank()).append(":\"").append(klass).append("\""); - if (StringUtils.isNotEmpty(order)) - sb.append(prefix).append(RankType.ORDER.getRank()).append(":\"").append(order).append("\""); + builder.add(NameIndexField.CLASS.search(this.klass), occurs); + if (StringUtils.isNotEmpty(order)) + builder.add(NameIndexField.ORDER.search(this.order), occurs); if (StringUtils.isNotEmpty(family)) - sb.append(prefix).append(RankType.FAMILY.getRank()).append(":\"").append(family).append("\""); + builder.add(NameIndexField.FAMILY.search(this.family), occurs); if (StringUtils.isNotEmpty(genus)) - sb.append(prefix).append(RankType.GENUS.getRank()).append(":\"").append(genus).append("\""); + builder.add(NameIndexField.GENUS.search(this.genus), occurs); if (StringUtils.isNotEmpty(species)) - sb.append(prefix).append(RankType.SPECIES.getRank()).append(":\"").append(species).append("\""); + builder.add(NameIndexField.SPECIES.search(this.species), occurs); //authorship is always optional due to inconsistencies in the name format etc... 
if (StringUtils.isNotEmpty(authorship)) - sb.append(" ").append(NameIndexField.AUTHOR.toString()).append(":\"").append(authorship).append("\"~"); - return sb.toString(); + builder.add(NameIndexField.AUTHOR.search(this.authorship), BooleanClause.Occur.SHOULD); } diff --git a/src/main/java/au/org/ala/names/model/NameIndexField.java b/src/main/java/au/org/ala/names/model/NameIndexField.java index baafd43ae..ec5237840 100644 --- a/src/main/java/au/org/ala/names/model/NameIndexField.java +++ b/src/main/java/au/org/ala/names/model/NameIndexField.java @@ -14,6 +14,11 @@ */ package au.org.ala.names.model; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.WildcardQuery; + /** * An Enum for all the fields that are indexed for the name matching. This enum is used by * {@link au.org.ala.names.search.ALANameIndexer} to create the index and @@ -22,39 +27,115 @@ * @author Natasha Carter */ public enum NameIndexField { - ID("id"), - LSID("lsid"), - PARENT_ID("parent_id"), - DOCUMENT_TYPE("doctype"), - ACCEPTED("accepted_lsid"), - iS_SYNONYM("is_synonym"),//whether or not the record is a synonym - GENUS("genus"), - GENUS_EX("genus_ex"), //genus sounds like expression - handles masculine and feminine too. - SPECIES_EX("specific_ex"),// specific epithet sounds like expression - INFRA_EX("infra_ex"),//infra specific epithet sounds like expression - SPECIFIC("specific"), - INFRA_SPECIFIC("infra"), - NAME("name"),// search name - OTHER_NAMES("other_names"),// Alternative names - NAME_CANONICAL("name_canonical"), // Canonical name - NAME_COMPLETE("name_complete"), // Complete name - RANK_ID("rank_id"), - RANK("rank"), - AUTHOR("author"), - PHRASE("phrase"),//stores the values of a "phrase" name. Some more intelligence will be needed when matching these - VOUCHER("voucher"), //stores a voucher value minus the spaces and fullstops. 
- ALA("ala"), //stores whether or not it is an ALA generated name - DATASET_ID("dataset_id"), // The source dataset - SYNONYM_TYPE("syn_type"), //stores the type of synonym that it represents + ID("id", FieldType.IDENTIFIER), + GUID("guid", FieldType.IDENTIFIER), + OTHER_GUID("otherGuid", FieldType.IDENTIFIER), + LEFT("left", FieldType.INTEGER), + RIGHT("right", FieldType.INTEGER), + LSID("lsid", FieldType.IDENTIFIER), + REAL_LSID("reallsid", FieldType.STORE), + PARENT_ID("parent_id", FieldType.IDENTIFIER), + DOCUMENT_TYPE("doctype", FieldType.IDENTIFIER), + ACCEPTED("accepted_lsid", FieldType.IDENTIFIER), + iS_SYNONYM("is_synonym", FieldType.IDENTIFIER),//whether or not the record is a synonym + KINGDOM("kingdom", FieldType.TERM), + KINGDOM_ID("kid", FieldType.STORE), + PHYLUM("phylum", FieldType.TERM), + PHYLUM_ID("pid", FieldType.STORE), + CLASS("class", FieldType.TERM), + CLASS_ID("cid", FieldType.STORE), + ORDER("order", FieldType.TERM), + ORDER_ID("oid", FieldType.STORE), + FAMILY("family", FieldType.TERM), + FAMILY_ID("fid", FieldType.STORE), + GENUS("genus", FieldType.TERM), + GENUS_ID("gid", FieldType.STORE), + GENUS_EX("genus_ex", FieldType.TERM), //genus sounds like expression - handles masculine and feminine too. 
+ SPECIES("species", FieldType.TERM), + SPECIES_ID("sid", FieldType.STORE), + SPECIES_EX("specific_ex", FieldType.TERM),// specific epithet sounds like expression + INFRA_EX("infra_ex", FieldType.TERM),//infra specific epithet sounds like expression + SPECIFIC("specific", FieldType.TERM), + INFRA_SPECIFIC("infra", FieldType.TERM), + NAME("name", FieldType.TEXT),// search name + OTHER_NAMES("other_names", FieldType.TEXT),// Alternative names + NAME_CANONICAL("name_canonical", FieldType.TEXT), // Canonical name + NAME_COMPLETE("name_complete", FieldType.TEXT), // Complete name + SEARCHABLE_COMMON_NAME("common", FieldType.COMMON), + COMMON_NAME("common_orig", FieldType.TEXT), + CONCAT_NAME("concat_name", FieldType.TERM), + RANK_ID("rank_id", FieldType.INTEGER), + RANK("rank", FieldType.TERM), + AUTHOR("author", FieldType.TEXT), + PHRASE("phrase", FieldType.TEXT),//stores the values of a "phrase" name. Some more intelligence will be needed when matching these + VOUCHER("voucher", FieldType.TEXT), //stores a voucher value minus the spaces and fullstops. 
+ ALA("ala", FieldType.IDENTIFIER), //stores whether or not it is an ALA generated name + DATASET_ID("dataset_id", FieldType.IDENTIFIER), // The source dataset + SYNONYM_TYPE("syn_type", FieldType.IDENTIFIER), //stores the type of synonym that it represents + HOMONYM("homonym", FieldType.IDENTIFIER), + LANGUAGE("lang", FieldType.IDENTIFIER), /* Stores the priority score associated with a taxon */ - PRIORITY("priority"); + PRIORITY("priority", FieldType.INTEGER); + + /** The field name */ String name; + /** The field type */ + FieldType type; - NameIndexField(String name) { + NameIndexField(String name, FieldType type) { this.name = name; + this.type = type; } public String toString() { return name; } + + /** + * Store a value into this field in a document + * + * @param value The value + * @param document The document + */ + public void store(Object value, Document document) { + if (value == null) + return; + this.type.store(value, this.name, document); + } + + /** + * Make a query for this field for a value. + * + * @param value The value + * + * @return A matching query + */ + public Query search(Object value) { + return this.type.search(value, this.name); + } + + /** + * Make a range query for this field for a value. + * + * @param lower The lower value (inclusive) + * @param upper The upper value (inclusive) + * + * @return A matching query + */ + public Query searchRange(Object lower, Object upper) { + return this.type.searchRange(lower, upper, this.name); + } + + + /** + * Make a wildcard query for this field for a value. 
+ * + * @param value The value, including "*" for wildcards + * + * @return A matching query + */ + public Query searchWildcard(String value) { + return new WildcardQuery(new Term(this.name, value)); + } + } diff --git a/src/main/java/au/org/ala/names/search/ALANameIndexer.java b/src/main/java/au/org/ala/names/search/ALANameIndexer.java index 0661cf64a..f96890c65 100644 --- a/src/main/java/au/org/ala/names/search/ALANameIndexer.java +++ b/src/main/java/au/org/ala/names/search/ALANameIndexer.java @@ -29,8 +29,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; -import org.apache.lucene.document.*; -import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -117,35 +116,6 @@ public class ALANameIndexer { private String indexDirectory; private IndexWriter cbIndexWriter; - //Fields that are being indexed or stored in the lucene index - public enum IndexField { - - NAME("name"), - NAMES("names"), - ID("id"), - RANK("rank"), - SEARCHABLE_NAME("searchcan"), - LSID("lsid"), - HOMONYM("homonym"), - ACCEPTED("synonym"), - LEFT("left"), - RIGHT("right"), - PRIORITY("priority"), - SEARCHABLE_COMMON_NAME("common"), - COMMON_NAME("common_orig"), - LANGUAGE("lang"); - - String name; - - IndexField(String name) { - this.name = name; - } - - public String toString() { - return name; - } - } - PhraseNameParser parser = new PhraseNameParser(); Set knownHomonyms = new HashSet(); Set blacklist = new HashSet(); @@ -244,11 +214,10 @@ private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception { Document doc = new Document(); String id = values[POS_ID]; String guid = values[POS_LSID]; - doc.add(new StringField("id", id, Store.YES)); - if (StringUtils.isEmpty(id)) + NameIndexField.ID.store(id, doc); 
+ if (StringUtils.isEmpty(id)) guid = id; - - doc.add(new StoredField("guid", guid)); + NameIndexField.GUID.store(guid, doc); iw.addDocument(doc); } System.out.println("Finished writing the tmp guid index..."); @@ -348,13 +317,13 @@ private void indexALA(IndexWriter iw, String file, String synonymFile) throws Ex values[POS_PID], values[POS_C], values[POS_CID], values[POS_O], values[POS_OID], values[POS_F], values[POS_FID], values[POS_G], values[POS_GID], values[POS_S], values[POS_SID], - values[POS_LFT], values[POS_RGT], acceptedValues, + Integer.parseInt(values[POS_LFT]), Integer.parseInt(values[POS_RGT]), acceptedValues, values[POS_SP_EPITHET], values[POS_INFRA_EPITHET], values[POS_AUTHOR], null, null, priority); //add the excluded information if applicable if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) { - doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(), SynonymType.EXCLUDES.getId().toString(), Store.YES)); + NameIndexField.SYNONYM_TYPE.store(SynonymType.EXCLUDES.getId().toString(), doc); } if (doc != null) { iw.addDocument(doc); @@ -442,44 +411,44 @@ protected void indexIrmngDwcA(IndexWriter iw, String archiveDirectory) throws Ex Document doc = new Document(); String kingdom = dwcr.value(DwcTerm.kingdom); if (StringUtils.isNotEmpty(kingdom)) { - doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES)); + NameIndexField.KINGDOM.store(kingdom, doc); } String phylum = dwcr.value(DwcTerm.phylum); if (StringUtils.isNotEmpty(phylum)) { - doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES)); + NameIndexField.PHYLUM.store(phylum, doc); } String classs = dwcr.value(DwcTerm.class_); if (StringUtils.isNotEmpty(classs)) { - doc.add(new TextField(RankType.CLASS.getRank(), classs, Store.YES)); + NameIndexField.CLASS.store(classs, doc); } String order = dwcr.value(DwcTerm.order); if (StringUtils.isNotEmpty(order)) { - doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES)); + 
NameIndexField.ORDER.store(order, doc); } String family = dwcr.value(DwcTerm.family); if (StringUtils.isNotEmpty(family)) { - doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES)); + NameIndexField.FAMILY.store(family, doc); } String genus = dwcr.value(DwcTerm.genus); String calculatedRank = "genus"; if (StringUtils.isNotEmpty(genus)) { - doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES)); + NameIndexField.GENUS.store(genus, doc); String specificEpithet = dwcr.value(DwcTerm.specificEpithet); if (StringUtils.isNotEmpty(specificEpithet)) { calculatedRank = "species"; - doc.add(new TextField(RankType.SPECIES.getRank(), genus + " " + specificEpithet, Store.YES)); + NameIndexField.SPECIES.store(genus + " " + specificEpithet, doc); } } String rank = dwcr.value(DwcTerm.taxonRank); if (StringUtils.isEmpty(rank)) rank = calculatedRank; - doc.add(new TextField(IndexField.RANK.toString(), rank, Store.YES)); + NameIndexField.RANK.store(rank, doc); //now add the author - we don't do anything about this on homonym resolution yet //Add the author information String author = dwcr.value(DwcTerm.scientificNameAuthorship); if (StringUtils.isNotEmpty(author)) { //TODO think about whether we need to treat the author string with the taxamatch - doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES)); + NameIndexField.AUTHOR.store(author, doc); } //now add it to the index iw.addDocument(doc); @@ -504,20 +473,21 @@ void indexIRMNG(IndexWriter iw, String irmngExport, RankType rank) throws Except while ((values = reader.readNext()) != null) { Document doc = new Document(); if (values != null && values.length >= 7) { - doc.add(new TextField(RankType.KINGDOM.getRank(), values[0], Store.YES)); - doc.add(new TextField(RankType.PHYLUM.getRank(), values[1], Store.YES)); - doc.add(new TextField(RankType.CLASS.getRank(), values[2], Store.YES)); - doc.add(new TextField(RankType.ORDER.getRank(), values[3], Store.YES)); - doc.add(new
TextField(RankType.FAMILY.getRank(), values[4], Store.YES)); - doc.add(new TextField(RankType.GENUS.getRank(), values[5], Store.YES)); + NameIndexField.KINGDOM.store(values[0], doc); + NameIndexField.PHYLUM.store(values[1], doc); + NameIndexField.CLASS.store(values[2], doc); + NameIndexField.ORDER.store(values[3], doc); + NameIndexField.FAMILY.store(values[4], doc); + NameIndexField.GENUS.store(values[5], doc); if (rank == RankType.GENUS) { - doc.add(new TextField(IndexField.ID.toString(), values[6], Store.YES)); - doc.add(new TextField(IndexField.ACCEPTED.toString(), values[8], Store.YES)); - doc.add(new TextField(IndexField.HOMONYM.toString(), values[10], Store.YES)); + + NameIndexField.ID.store(values[6], doc); + NameIndexField.ACCEPTED.store(values[8], doc); + NameIndexField.HOMONYM.store(values[10], doc); } else if (rank == RankType.SPECIES) { - doc.add(new TextField(RankType.SPECIES.getRank(), values[6], Store.YES)); + NameIndexField.SPECIES.store(values[6], doc); } - doc.add(new TextField(IndexField.RANK.toString(), rank.getRank(), Store.YES)); + NameIndexField.RANK.store(rank.getRank(), doc); iw.addDocument(doc); count++; } @@ -651,9 +621,9 @@ protected void createExtraIdIndex(IndexWriter iw, File idFile) throws Exception if (values != null && values.length >= 3) { Document doc = new Document(); //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED)); - doc.add(new StringField("lsid", values[2], Store.YES)); + NameIndexField.LSID.store(values[2], doc); //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO)); - doc.add(new StoredField("reallsid", values[1])); + NameIndexField.REAL_LSID.store(values[1], doc); iw.addDocument(doc); } } @@ -700,7 +670,7 @@ private IndexSearcher createTmpIndex(String tcFileName) throws Exception { //just add the LSID to the index Document doc = new Document(); - doc.add(new StringField("lsid", values[0], Store.YES)); + NameIndexField.LSID.store(values[0], doc); iw.addDocument(doc); } @@ -757,31 +727,28 @@ 
protected Document createCommonNameDocument(String cn, String sn, String lsid, S protected Document createCommonNameDocument(String cn, String sn, String lsid, String language, boolean checkAccepted) { Document doc = new Document(); - //we are only interested in keeping all the alphanumerical values of the common name - //when searching the same operations will need to be peformed on the search string - TextField searchAbleName = new TextField(IndexField.SEARCHABLE_COMMON_NAME.toString(), cn.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""), Store.YES); - doc.add(searchAbleName); + // Uses field type to normalise + NameIndexField.SEARCHABLE_COMMON_NAME.store(cn, doc); if (sn != null) { - doc.add(new TextField(IndexField.NAME.toString(), sn, Store.YES)); + NameIndexField.NAME.store(sn, doc); } String newLsid = getAcceptedLSID(lsid); - - doc.add(new TextField(IndexField.COMMON_NAME.toString(), cn, Store.YES)); - doc.add(new TextField(IndexField.LSID.toString(), newLsid, Store.YES)); + NameIndexField.COMMON_NAME.store(cn, doc); + NameIndexField.LSID.store(newLsid, doc); if(language != null) { - doc.add(new TextField(IndexField.LANGUAGE.toString(), language.toLowerCase().trim(), Store.YES)); - } + NameIndexField.LANGUAGE.store(language.toLowerCase().trim(), doc); + } return doc; } public Document createALAIndexDocument(String name, String id, String lsid, String author, LinnaeanRankClassification cl){ - return createALAIndexDocument(name,id, lsid, author,null,null, null, null, cl, null, null, MatchMetrics.DEFAULT_PRIORITY); + return createALAIndexDocument(name,id, lsid, author,null,null, 0, 0, cl, null, null, MatchMetrics.DEFAULT_PRIORITY); } - public Document createALAIndexDocument(String name, String id, String lsid, String author, String rank, String rankId, String left, String right, LinnaeanRankClassification cl, String nameComplete, Collection otherNames, int priority){ + public Document createALAIndexDocument(String name, String id, String lsid, String 
author, String rank, String rankId, int left, int right, LinnaeanRankClassification cl, String nameComplete, Collection otherNames, int priority){ if(cl == null) cl = new LinnaeanRankClassification(); return createALAIndexDocument(name, id, lsid, rankId, rank, cl.getKingdom(), cl.getKid(), cl.getPhylum() @@ -792,11 +759,11 @@ public Document createALAIndexDocument(String name, String id, String lsid, Stri protected Document createALASynonymDocument(String scientificName, String author, String nameComplete, Collection otherNames, String id, String lsid, String nameLsid, String acceptedLsid, String acceptedId, int priority, String synonymType) { lsid = StringUtils.isBlank(lsid) ? nameLsid : lsid; Document doc = createALAIndexDocument(scientificName, id, lsid, null, null, - null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, null, null, null, null, null, null, null, 0, 0, acceptedLsid, null, null, author, nameComplete, otherNames, priority); if (doc != null && synonymType != null) { try { - doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(), synonymType, Store.YES)); + NameIndexField.SYNONYM_TYPE.store(synonymType, doc); } catch (Exception e) { System.out.println("Error on " + scientificName + " " + author + " " + id + ". 
" + e.getMessage()); } @@ -811,7 +778,7 @@ private boolean isBlacklisted(String scientificName) { protected Document createALAIndexDocument(String name, String id, String lsid, String rank, String rankString, String kingdom, String kid, String phylum, String pid, String clazz, String cid, String order, String oid, String family, String fid, String genus, String gid, - String species, String sid, String left, String right, String acceptedConcept, String specificEpithet, + String species, String sid, int left, int right, String acceptedConcept, String specificEpithet, String infraspecificEpithet, String author, String nameComplete, Collection otherNames, int priority) { // @@ -820,6 +787,7 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S return null; } + int rankIndex = rank == null || rank.isEmpty() ? -1 : Integer.parseInt(rank); nameComplete = buildNameComplete(name, author, nameComplete); CleanedScientificName cname = new CleanedScientificName(name); CleanedScientificName cnameComplete = new CleanedScientificName(nameComplete); @@ -827,11 +795,10 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S String soundexGenus = genus; //Add the ids - doc.add(new StringField(NameIndexField.ID.toString(), id, Store.YES)); - - doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Store.YES)); + NameIndexField.ID.store(id, doc); + NameIndexField.LSID.store(lsid, doc); if (lsid.startsWith("ALA")) { - doc.add(new StringField(NameIndexField.ALA.toString(), "T", Store.YES)); + NameIndexField.ALA.store("T", doc); } @@ -843,85 +810,83 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S nameSet.add(cnameComplete.getNormalised()); nameSet.add(cnameComplete.getBasic()); for (String n: nameSet) { - Field f = new TextField(NameIndexField.NAME.toString(), n, Store.YES); - doc.add(f); + NameIndexField.NAME.store(n, doc); } - - doc.add(new
StringField(NameIndexField.NAME_CANONICAL.toString(), cname.getNormalised(), Store.YES)); - doc.add(new StringField(NameIndexField.NAME_COMPLETE.toString(), cnameComplete.getNormalised(), Store.YES)); + NameIndexField.NAME_CANONICAL.store(cname.getNormalised(), doc); + NameIndexField.NAME_COMPLETE.store(cnameComplete.getNormalised(), doc); //rank information - if (StringUtils.isNotEmpty(rank)) { - doc.add(new StringField(NameIndexField.RANK_ID.toString(), rank, Store.YES)); + if (rankIndex >= 0) { + NameIndexField.RANK_ID.store(rankIndex, doc); } if (StringUtils.isNotEmpty(rankString)) { - doc.add(new StringField(NameIndexField.RANK.toString(), rankString, Store.YES)); + NameIndexField.RANK.store(rankString, doc); } //handle the synonyms if (StringUtils.isNotEmpty(acceptedConcept)) { - doc.add(new StringField(NameIndexField.ACCEPTED.toString(), acceptedConcept, Store.YES)); - doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Store.YES)); - } else { - doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Store.YES)); + NameIndexField.ACCEPTED.store(acceptedConcept, doc); + NameIndexField.iS_SYNONYM.store("T", doc); + } else { + NameIndexField.iS_SYNONYM.store("F", doc); } //Add the classification information if (StringUtils.trimToNull(kingdom) != null) { - doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES)); - if (StringUtils.isNotBlank(kid)) { - doc.add(new StoredField("kid", kid)); + NameIndexField.KINGDOM.store(kingdom, doc); + if (StringUtils.isNotBlank(kid)) { + NameIndexField.KINGDOM_ID.store(kid, doc); } } if (StringUtils.trimToNull(phylum) != null) { - doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES)); + NameIndexField.PHYLUM.store(phylum, doc); if (StringUtils.isNotBlank(pid)) { - doc.add(new StoredField("pid", pid)); + NameIndexField.PHYLUM_ID.store(pid, doc); } } if (StringUtils.trimToNull(clazz) != null) { - doc.add(new TextField(RankType.CLASS.getRank(), clazz, Store.YES)); + 
NameIndexField.CLASS.store(clazz, doc); if (StringUtils.isNotBlank(cid)) { - doc.add(new StoredField("cid", cid)); + NameIndexField.CLASS_ID.store(cid, doc); } } if (StringUtils.trimToNull(order) != null) { - doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES)); + NameIndexField.ORDER.store(order, doc); if (StringUtils.isNotBlank(oid)) { - doc.add(new StoredField("oid", oid)); + NameIndexField.ORDER_ID.store(oid, doc); } } if (StringUtils.trimToNull(family) != null) { - doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES)); + NameIndexField.FAMILY.store(family, doc); if (StringUtils.isNotBlank(fid)) { - doc.add(new StoredField("fid", fid)); + NameIndexField.FAMILY_ID.store(fid, doc); } } if (StringUtils.trimToNull(genus) != null) { - doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES)); + NameIndexField.GENUS.store(genus, doc); if (StringUtils.isNotBlank(gid)) { - doc.add(new StoredField("gid", gid)); + NameIndexField.GENUS_ID.store(gid, doc); } } if (StringUtils.trimToNull(species) != null) { - doc.add(new TextField(RankType.SPECIES.getRank(), species, Store.YES)); + NameIndexField.SPECIES.store(species, doc); if (StringUtils.isNotBlank(sid)) { - doc.add(new StoredField("sid", sid)); + NameIndexField.SPECIES_ID.store(sid, doc); } } - if (StringUtils.trimToNull(left) != null) { - doc.add(new StringField("left", left, Store.YES)); + if (left > 0) { + NameIndexField.LEFT.store(left, doc); } - if (StringUtils.trimToNull(right) != null) { - doc.add(new StringField("right", right, Store.YES)); + if (right > 0) { + NameIndexField.RIGHT.store(right, doc); } - doc.add(new StoredField("priority", priority)); + NameIndexField.PRIORITY.store(priority, doc); //Add the author information if (StringUtils.isNotEmpty(author)) { //TODO think about whether we need to treat the author string with the taxamatch - doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES)); + NameIndexField.AUTHOR.store(author, doc); } @@ 
-936,8 +901,7 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S && cn.getType() != NameType.INFORMAL && !"6500".equals(rank) && cn.getType() != NameType.DOUBTFUL) { if (!nameSet.contains(cn.canonicalName())) { - Field f2 = new TextField(NameIndexField.NAME.toString(), cn.canonicalName(), Store.YES); - doc.add(f2); + NameIndexField.NAME.store(cn.canonicalName(), doc); } if (specificEpithet == null && cn.isBinomial()) { //check to see if we need to determine the epithets from the parse @@ -949,30 +913,28 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S //check to see if the concept represents a phrase name if (cn != null && cn instanceof ALAParsedName) { //set up the field type that is stored and Index.ANALYZED_NO_NORMS - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setOmitNorms(true); ALAParsedName alapn = (ALAParsedName) cn; if (alapn.getRank() != Rank.SPECIES && alapn.getSpecificEpithet() != null) { - doc.add(new Field(NameIndexField.SPECIFIC.toString(), alapn.getSpecificEpithet(), ft)); + NameIndexField.SPECIFIC.store(alapn.getSpecificEpithet(), doc); } else if (alapn.getRank() != Rank.SPECIES && alapn.getSpecificEpithet() == null) { log.warn(lsid + " " + name + " has an empty specific for non sp. phrase"); } if (StringUtils.trimToNull(alapn.getLocationPhraseDescription()) != null) { - doc.add(new Field(NameIndexField.PHRASE.toString(), alapn.cleanPhrase, ft)); + NameIndexField.PHRASE.store(alapn.cleanPhrase, doc); } if (alapn.getPhraseVoucher() != null) { - doc.add(new Field(NameIndexField.VOUCHER.toString(), alapn.cleanVoucher, ft)); + NameIndexField.VOUCHER.store(alapn.cleanVoucher, doc); } if (StringUtils.isBlank(genus) && StringUtils.isNotBlank(alapn.getGenusOrAbove())) { //add the genus to the index as it is necessary to match on the phrase name. 
- doc.add(new TextField(RankType.GENUS.getRank(), alapn.getGenusOrAbove(), Store.YES)); + NameIndexField.GENUS.store(alapn.getGenusOrAbove(), doc); } } } catch (org.gbif.api.exception.UnparsableException e) { //check to see if the name is a virus in which case an extra name is added without the virus key word if (e.type == NameType.VIRUS) { - doc.add(new TextField(NameIndexField.NAME.toString(), ALANameSearcher.virusStopPattern.matcher(name).replaceAll(" "), Store.YES)); + NameIndexField.NAME.store(ALANameSearcher.virusStopPattern.matcher(name).replaceAll(" "), doc); } } catch (Exception e) { @@ -983,24 +945,24 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S //add the sound expressions for the name if required try { if (StringUtils.isNotBlank(soundexGenus)) { - doc.add(new TextField(NameIndexField.GENUS_EX.toString(), TaxonNameSoundEx.treatWord(soundexGenus, "genus"), Store.YES)); + NameIndexField.GENUS_EX.store(TaxonNameSoundEx.treatWord(soundexGenus, "genus"), doc); } if (StringUtils.isNotBlank(specificEpithet)) { String soundex = TaxonNameSoundEx.treatWord(specificEpithet, "species"); if (soundex == null) soundex = ""; - doc.add(new TextField(NameIndexField.SPECIES_EX.toString(), soundex, Store.YES)); + NameIndexField.SPECIES_EX.store(soundex, doc); } else if (StringUtils.isNotBlank(soundexGenus)) { - doc.add(new TextField(NameIndexField.SPECIES_EX.toString(), "", Store.YES)); + NameIndexField.SPECIES_EX.store("", doc); } if (StringUtils.isNotBlank(infraspecificEpithet)) { String soundex = TaxonNameSoundEx.treatWord(infraspecificEpithet, "species"); if (soundex == null) soundex = ""; - doc.add(new TextField(NameIndexField.INFRA_EX.toString(), soundex, Store.YES)); - } else if (StringUtils.isNotBlank(specificEpithet)) { + NameIndexField.INFRA_EX.store(soundex, doc); + } else if (StringUtils.isNotBlank(specificEpithet)) { //make searching for an empty infraspecific soudex easier - doc.add(new 
TextField(NameIndexField.INFRA_EX.toString(), "", Store.YES)); + NameIndexField.INFRA_EX.store("", doc); } } catch (Exception e) { log.warn(lsid + " " + name + " has issues creating a soundex: " + e.getMessage()); diff --git a/src/main/java/au/org/ala/names/search/ALANameSearcher.java b/src/main/java/au/org/ala/names/search/ALANameSearcher.java index 93e616193..d1355169a 100644 --- a/src/main/java/au/org/ala/names/search/ALANameSearcher.java +++ b/src/main/java/au/org/ala/names/search/ALANameSearcher.java @@ -14,7 +14,6 @@ */ package au.org.ala.names.search; -import au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer; import au.org.ala.names.model.*; import au.org.ala.names.util.CleanedScientificName; import au.org.ala.names.util.TaxonNameSoundEx; @@ -22,12 +21,8 @@ import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; -import org.apache.lucene.index.*; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.search.*; import org.apache.lucene.store.FSDirectory; import org.gbif.api.exception.UnparsableException; @@ -36,10 +31,7 @@ import org.gbif.api.vocabulary.Rank; import org.gbif.nameparser.PhraseNameParser; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; +import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; @@ -74,8 +66,6 @@ public class ALANameSearcher { protected Log log = LogFactory.getLog(ALANameSearcher.class); protected DirectoryReader cbReader, irmngReader, vernReader; protected IndexSearcher cbSearcher, irmngSearcher, vernSearcher, idSearcher; - protected ThreadLocal queryParser; 
- protected ThreadLocal idParser; protected TaxonNameSoundEx tnse; protected PhraseNameParser parser; public static final Pattern virusStopPattern = Pattern.compile(" virus| ictv| ICTV"); @@ -95,56 +85,33 @@ public ALANameSearcher(){} * as the source directory * * @param indexDirectory The directory that contains the index files for the scientific names, irmng and vernacular names. - * @throws CorruptIndexException * @throws IOException */ public ALANameSearcher(String indexDirectory) throws IOException { //Initialise CB index searching items log.debug("Creating the search object for the name matching api..."); - //make the query parsers thread safe - queryParser = new ThreadLocal() { - @Override - protected QueryParser initialValue() { - QueryParser qp = new QueryParser("genus", LowerCaseKeywordAnalyzer.newInstance()); - qp.setFuzzyMinSim(0.8f); //fuzzy match similarity setting. used to match the authorship. - return qp; - } - }; - idParser = new ThreadLocal() { - @Override - protected QueryParser initialValue() { - return new QueryParser( "lsid", new org.apache.lucene.analysis.core.KeywordAnalyzer()); - } - }; - - cbReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "cb")));//false + cbReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "cb")));//false cbSearcher = new IndexSearcher(cbReader); //Initialise the IRMNG index searching items - irmngReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "irmng"))); + irmngReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "irmng"))); irmngSearcher = new IndexSearcher(irmngReader); //initialise the Common name index searching items - vernReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "vernacular"))); + vernReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + 
"vernacular"))); vernSearcher = new IndexSearcher(vernReader); //initialise the identifier index - idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "id")))); + idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "id")))); tnse = new TaxonNameSoundEx(); parser = new PhraseNameParser(); crossRankHomonyms = au.org.ala.names.util.FileUtils.streamToSet( this.getClass().getClassLoader().getResourceAsStream("au/org/ala/homonyms/cross_rank_homonyms.txt"), new java.util.HashSet(), true); } - private Path createIfNotExist(String indexDirectory) throws IOException { - + private Path findPath(String indexDirectory) throws IOException { File idxFile = new File(indexDirectory); - Path path = Paths.get(indexDirectory); if (!idxFile.exists()) { - FileUtils.forceMkdir(idxFile); - Analyzer analyzer = new StandardAnalyzer(); - IndexWriterConfig conf = new IndexWriterConfig(analyzer); - IndexWriter iw = new IndexWriter(FSDirectory.open(path), conf); - iw.commit(); - iw.close(); + throw new FileNotFoundException(idxFile.toString()); } + Path path = Paths.get(indexDirectory); return path; } @@ -154,8 +121,7 @@ private Path createIfNotExist(String indexDirectory) throws IOException { public void dumpSpecies() { try { OutputStreamWriter fileOut = new OutputStreamWriter(new FileOutputStream("/data/species.txt"), "UTF-8"); - Term term = new Term("rank", "species"); - TopDocs hits = cbSearcher.search(new TermQuery(term), 2000000); + TopDocs hits = cbSearcher.search(NameIndexField.RANK.search("species"), 2000000); for (ScoreDoc sdoc : hits.scoreDocs) { Document doc = cbReader.document(sdoc.doc); @@ -894,7 +860,7 @@ public NameSearchResult searchForRecord(String name, LinnaeanRankClassification */ public NameSearchResult searchForRecordByID(String id) { try { - List results = performSearch(ALANameIndexer.IndexField.ID.toString(), id, null, null, 1, null, false, 
idParser.get()); + List results = performSearch(NameIndexField.ID, id, null, null, 1, null, false); if (results.size() > 0) { results.get(0).setMatchType(MatchType.TAXON_ID); return results.get(0); @@ -1021,7 +987,7 @@ private List searchForRecords(String name, RankType rank, Linn log.warn("Unable to parse " + name + ". " + e.getMessage()); } //Check for the exact match - List hits = performSearch(NameIndexField.NAME.toString(), cleaned.getNormalised(), rank, cl, max, MatchType.EXACT, true, queryParser.get()); + List hits = performSearch(NameIndexField.NAME, cleaned.getNormalised(), rank, cl, max, MatchType.EXACT, true); if (hits == null) // situation where searcher has not been initialised { return null; @@ -1043,12 +1009,13 @@ private List searchForRecords(String name, RankType rank, Linn String voucher = alapn.cleanVoucher; //String voucher = alapn.phraseVoucher != null ? voucherRemovePattern.matcher(alapn.phraseVoucher).replaceAll("") :null; String specific = alapn.getRank() != null && alapn.getRank().equals(Rank.SPECIES) ? null : alapn.getSpecificEpithet(); - String[][] searchFields = new String[4][]; - searchFields[0] = new String[]{RankType.GENUS.getRank(), genus}; - searchFields[1] = new String[]{NameIndexField.PHRASE.toString(), phrase}; - searchFields[2] = new String[]{NameIndexField.VOUCHER.toString(), voucher}; - searchFields[3] = new String[]{NameIndexField.SPECIFIC.toString(), specific}; - hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false, queryParser.get()); //don't want to check for homonyms yet... + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS, genus), + Value.of(NameIndexField.PHRASE, phrase), + Value.of(NameIndexField.VOUCHER, voucher), + Value.of(NameIndexField.SPECIFIC, specific) + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false); //don't want to check for homonyms yet... 
if (hits.size() == 1) { return hits; } else if (hits.size() > 1) { @@ -1069,7 +1036,7 @@ private List searchForRecords(String name, RankType rank, Linn if (cl.getAuthorship() == null && pn.isAuthorsParsed()) { cl.setAuthorship(pn.authorshipComplete()); } - hits = performSearch(ALANameIndexer.IndexField.NAME.toString(), canonicalName, rank, cl, max, MatchType.CANONICAL, true, queryParser.get()); + hits = performSearch(NameIndexField.NAME, canonicalName, rank, cl, max, MatchType.CANONICAL, true); if (hits.size() > 0) { return hits; } @@ -1079,12 +1046,13 @@ private List searchForRecords(String name, RankType rank, Linn String phrase = pn.getCultivarEpithet(); String voucher = null; String specific = pn.getRank() != null && pn.getRank().equals(Rank.SPECIES) ? null : pn.getSpecificEpithet(); - String[][] searchFields = new String[4][]; - searchFields[0] = new String[]{RankType.GENUS.getRank(), genus}; - searchFields[1] = new String[]{NameIndexField.PHRASE.toString(), phrase}; - searchFields[2] = new String[]{NameIndexField.VOUCHER.toString(), voucher}; - searchFields[3] = new String[]{NameIndexField.SPECIFIC.toString(), specific}; - hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false, queryParser.get()); + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS, genus), + Value.of(NameIndexField.PHRASE, phrase), + Value.of(NameIndexField.VOUCHER, voucher), + Value.of(NameIndexField.SPECIFIC, specific) + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false); if (hits.size() > 0) { return hits; } @@ -1095,15 +1063,12 @@ private List searchForRecords(String name, RankType rank, Linn String genus = TaxonNameSoundEx.treatWord(pn.getGenusOrAbove(), "genus"); String specific = TaxonNameSoundEx.treatWord(pn.getSpecificEpithet(), "species"); String infra = pn.getInfraSpecificEpithet() == null ? 
null : TaxonNameSoundEx.treatWord(pn.getInfraSpecificEpithet(), "species"); - String[][] searchFields = new String[3][]; - searchFields[0] = new String[]{NameIndexField.GENUS_EX.toString(), genus}; - searchFields[1] = new String[]{NameIndexField.SPECIES_EX.toString(), specific}; - if (StringUtils.isNotEmpty(infra)) { - searchFields[2] = new String[]{NameIndexField.INFRA_EX.toString(), infra}; - } else { - searchFields[2] = new String[]{NameIndexField.INFRA_EX.toString(), ""}; - } - hits = performSearch(searchFields, rank, cl, max, MatchType.SOUNDEX, false, queryParser.get()); //don't want to check for homonyms yet... + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS_EX, genus), + Value.of(NameIndexField.SPECIES_EX, specific), + Value.of(NameIndexField.INFRA_EX, StringUtils.isNotEmpty(infra) ? infra : "") + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.SOUNDEX, false); //don't want to check for homonyms yet... if (hits.size() > 0) { return hits; } @@ -1146,140 +1111,126 @@ else if (hit.getAcceptedLsid() != null) { return acceptedLsid == null ? 
null : searchForRecordByLsid(acceptedLsid); } - private List performSearch(String field, String value, RankType rank, + private List performSearch(NameIndexField field, String value, RankType rank, LinnaeanRankClassification cl, int max, MatchType type, - boolean checkHomo, QueryParser parser) throws IOException, SearchResultException { - String[][] compValues = new String[1][]; - compValues[0] = new String[]{field, value}; - return performSearch(compValues, rank, cl, max, type, checkHomo, parser); + boolean checkHomo) throws IOException, SearchResultException { + return performSearch(Arrays.asList(Value.of(field, value)), rank, cl, max, type, checkHomo); } /** * Performs an index search based on the supplied field and name * - * @param compulsoryValues 2D array of field and value mappings to perform the search on + * @param compulsoryValues A list of required values * @param rank Optional rank of the value * @param cl The high taxa that form the classification for the search item * @param max The maximum number of results to return * @param type The type of search that is being performed * @param checkHomo Whether or not the result should check for homonyms. 
- * @param parser * @return * @throws IOException * @throws SearchResultException */ - private List performSearch(String[][] compulsoryValues, RankType rank, - LinnaeanRankClassification cl, int max, MatchType type, boolean checkHomo, - QueryParser parser) throws IOException, SearchResultException { + private List performSearch(List compulsoryValues, RankType rank, + LinnaeanRankClassification cl, int max, MatchType type, boolean checkHomo) throws IOException, SearchResultException { if (cbSearcher != null) { String scientificName = null; - StringBuilder query = new StringBuilder(); - for (String[] values : compulsoryValues) { - if (values[1] != null) { - - query.append("+" + values[0] + ":\"" + values[1] + "\""); - - if (values[0].equals(NameIndexField.NAME.toString())) - scientificName = values[1]; - } + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (Value value: compulsoryValues) { + if (value.value != null) { + builder.add(value.field.search(value.value), BooleanClause.Occur.MUST); + if (value.field == NameIndexField.NAME) + scientificName = value.value.toString(); + } } if (rank != null) { - //if the rank is below species include all names that are species level and below in case synonyms have changed ranks. - query.append("+("); - if (rank.getId() >= RankType.SPECIES.getId()) { - query.append(NameIndexField.RANK_ID.toString()).append(":[7000 TO 9999]"); - - } else - query.append(NameIndexField.RANK.toString() + ":\"" + rank.getRank() + "\""); - //cater for the situation where the search term could be a synonym that does not have a rank + int lower = rank.getId(); + int upper = rank.getId() >= RankType.SPECIES.getId() ? 9999 : rank.getId(); + BooleanQuery.Builder rankBuilder = new BooleanQuery.Builder(); + rankBuilder.add(NameIndexField.RANK_ID.searchRange(lower, upper), BooleanClause.Occur.SHOULD); + //cater for the situation where the search term could be a synonym that does not have a rank // also ALA added concepts do NOT have ranks. 
- query.append(" OR ").append(NameIndexField.iS_SYNONYM.toString()).append(":T OR ").append(NameIndexField.ALA).append(":T)"); - + rankBuilder.add(NameIndexField.iS_SYNONYM.search("T"), BooleanClause.Occur.SHOULD); + rankBuilder.add(NameIndexField.ALA.search("T"), BooleanClause.Occur.SHOULD); + builder.add(rankBuilder.build(), BooleanClause.Occur.MUST); } if (cl != null) { - query.append(cl.getLuceneSearchString(true)); - + cl.appendLuceneQuery(builder, true); } + Query query = builder.build(); - try { - Query scoreQuery = parser.parse(query.toString()); - TopDocs hits = cbSearcher.search(scoreQuery, max);//cbSearcher.search(boolQuery, max); + TopDocs hits = cbSearcher.search(query, max);//cbSearcher.search(boolQuery, max); - //now put the hits into the arrayof NameSearchResult - List results = new java.util.ArrayList(); + //now put the hits into the arrayof NameSearchResult + List results = new java.util.ArrayList(); - for (ScoreDoc sdoc : hits.scoreDocs) { - NameSearchResult nsr = new NameSearchResult(cbReader.document(sdoc.doc), type); - nsr.computeMatch(cl); - results.add(nsr); - } - results.sort(Comparator.comparing(NameSearchResult::getMatchMetrics).reversed()); - if (results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).count() > 0) { - results = results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).collect(Collectors.toList()); - } - //HOMONYM CHECKS and other checks - if (checkHomo) { - - //check to see if one of the results is excluded - if (results.size() > 0) { - int exclCount = 0; - NameSearchResult notExcludedResult = null; - NameSearchResult excludedResult = null; - for (NameSearchResult nsr : results) { - if (nsr.getSynonymType() == au.org.ala.names.model.SynonymType.EXCLUDES) { - exclCount++; - excludedResult = nsr; - } else if (notExcludedResult == null) { - notExcludedResult = nsr; - } + for (ScoreDoc sdoc : hits.scoreDocs) { + NameSearchResult nsr = new NameSearchResult(cbReader.document(sdoc.doc), 
type); + nsr.computeMatch(cl); + results.add(nsr); + } + results.sort(Comparator.comparing(NameSearchResult::getMatchMetrics).reversed()); + if (results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).count() > 0) { + results = results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).collect(Collectors.toList()); + } + //HOMONYM CHECKS and other checks + if (checkHomo) { + + //check to see if one of the results is excluded + if (results.size() > 0) { + int exclCount = 0; + NameSearchResult notExcludedResult = null; + NameSearchResult excludedResult = null; + for (NameSearchResult nsr : results) { + if (nsr.getSynonymType() == au.org.ala.names.model.SynonymType.EXCLUDES) { + exclCount++; + excludedResult = nsr; + } else if (notExcludedResult == null) { + notExcludedResult = nsr; } - if (exclCount > 0) { - //throw the basic exception if count == result size - if (exclCount == results.size()) { - throw new ExcludedNameException("The result is a name that has been excluded from the NSL", excludedResult); - } else if (notExcludedResult != null) { - //one of the results was an excluded concept - throw new ExcludedNameException("One of the results was excluded. Use the nonExcludedName for your match.", notExcludedResult, excludedResult); - } + } + if (exclCount > 0) { + //throw the basic exception if count == result size + if (exclCount == results.size()) { + throw new ExcludedNameException("The result is a name that has been excluded from the NSL", excludedResult); + } else if (notExcludedResult != null) { + //one of the results was an excluded concept + throw new ExcludedNameException("One of the results was excluded. 
Use the nonExcludedName for your match.", notExcludedResult, excludedResult); } } + } - //check to see if we have a situtation where a species has been split into subspecies and a synonym exists to the subspecies - checkForSpeciesSplit(results); + //check to see if we have a situtation where a species has been split into subspecies and a synonym exists to the subspecies + checkForSpeciesSplit(results); - //check to see if one of the results is a misapplied synonym - checkForMisapplied(results); + //check to see if one of the results is a misapplied synonym + checkForMisapplied(results); - //check result level homonyms - //TODO 2012-04-17: Work out edge case issues for canonical matches... - //checkResultLevelHomonym(results); + //check result level homonyms + //TODO 2012-04-17: Work out edge case issues for canonical matches... + //checkResultLevelHomonym(results); - //check to see if we have a cross rank homonym - //cross rank homonyms are resolvable if a rank has been supplied - if (rank == null) { - checkForCrossRankHomonym(results); - } + //check to see if we have a cross rank homonym + //cross rank homonyms are resolvable if a rank has been supplied + if (rank == null) { + checkForCrossRankHomonym(results); + } - //check to see if the search criteria could represent an unresolved genus or species homonym - if (results.size() > 0) { - RankType resRank = results.get(0).getRank(); - if ((resRank == RankType.GENUS || resRank == RankType.SPECIES) || (results.get(0).isSynonym() && (rank == null || rank == RankType.GENUS || rank == RankType.SPECIES))) { - NameSearchResult result = (cl != null && StringUtils.isNotBlank(cl.getAuthorship())) ? 
validateHomonymByAuthor(results, scientificName, cl) : validateHomonyms(results, scientificName, cl); - results.clear(); - results.add(result); - } + //check to see if the search criteria could represent an unresolved genus or species homonym + if (results.size() > 0) { + RankType resRank = results.get(0).getRank(); + if ((resRank == RankType.GENUS || resRank == RankType.SPECIES) || (results.get(0).isSynonym() && (rank == null || rank == RankType.GENUS || rank == RankType.SPECIES))) { + NameSearchResult result = (cl != null && StringUtils.isNotBlank(cl.getAuthorship())) ? validateHomonymByAuthor(results, scientificName, cl) : validateHomonyms(results, scientificName, cl); + results.clear(); + results.add(result); } } - - return results; - } catch (ParseException e) { - throw new SearchResultException("Error parsing " + query.toString() + "." + e.getMessage()); } + return results; } return null; } @@ -1528,12 +1479,10 @@ public TopDocs getIRMNGGenus(LinnaeanRankClassification cl, RankType rank) { if (cl != null && (cl.getGenus() != null || cl.getSpecies() != null)) { try { - - String searchString = "+rank:" + rank + " " + cl.getLuceneSearchString(false).trim(); - - - log.debug("Search string : " + searchString + " classification : " + cl); - Query query = queryParser.get().parse(searchString); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(NameIndexField.RANK.search(rank.getRank()), BooleanClause.Occur.MUST); + cl.appendLuceneQuery(builder, false); + Query query = builder.build(); log.debug("getIRMNG query: " + query.toString()); return irmngSearcher.search(query, 10); @@ -1627,13 +1576,13 @@ public String searchForLSIDCommonName(String commonName) { */ public String getCommonNameForLSID(String lsid) { if (lsid != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); try { TopDocs results = vernSearcher.search(query, 1); log.debug("Number of 
matches for " + lsid + " " + results.totalHits); for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - return doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + return doc.get(NameIndexField.COMMON_NAME.toString()); } } catch (IOException e) { log.debug("Unable to access document for common name.", e); @@ -1652,16 +1601,14 @@ public String getCommonNameForLSID(String lsid, String[] languages) { if (lsid != null) { for (String language: languages) { try { - Query query = queryParser.get().parse( - ALANameIndexer.IndexField.LSID.toString() + ":\"" + lsid + "\" " + - " AND " + - ALANameIndexer.IndexField.LANGUAGE.toString() + ":\"" + language + "\" " - ); - TopDocs results = vernSearcher.search(query, 1); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(NameIndexField.LSID.search(lsid), BooleanClause.Occur.MUST); + builder.add(NameIndexField.LANGUAGE.search(language), BooleanClause.Occur.MUST); + TopDocs results = vernSearcher.search(builder.build(), 1); log.debug("Number of matches for " + lsid + " " + results.totalHits); for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - return doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + return doc.get(NameIndexField.COMMON_NAME.toString()); } } catch (Exception e) { log.debug("Unable to access document for common name.", e); @@ -1678,7 +1625,7 @@ public String getCommonNameForLSID(String lsid, String[] languages) { */ public Set getCommonNamesForLSID(String lsid, int maxNumberOfNames) { if (lsid != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); try { TopDocs results = vernSearcher.search(query, maxNumberOfNames); //if all the results have the same scientific name result the LSID for the first @@ -1689,7 +1636,7 @@ public Set getCommonNamesForLSID(String 
lsid, int maxNumberOfNames) { int idx = 0; for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - String name = doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + String name = doc.get(NameIndexField.COMMON_NAME.toString()); if(!lowerCaseResults.contains(name.toLowerCase())){ lowerCaseResults.add(name.toLowerCase()); names.add(name); @@ -1715,7 +1662,7 @@ public Set getCommonNamesForLSID(String lsid, int maxNumberOfNames) { */ private String getLSIDForUniqueCommonName(String name) { if (name != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.SEARCHABLE_COMMON_NAME.toString(), name.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""))); + Query query = NameIndexField.SEARCHABLE_COMMON_NAME.search(name); try { TopDocs results = vernSearcher.search(query, 10); //if all the results have the same scientific name result the LSID for the first @@ -1725,10 +1672,10 @@ private String getLSIDForUniqueCommonName(String name) { for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); if (firstLsid == null) { - firstLsid = doc.get(ALANameIndexer.IndexField.LSID.toString()); - firstName = doc.get(ALANameIndexer.IndexField.NAME.toString()); + firstLsid = doc.get(NameIndexField.LSID.toString()); + firstName = doc.get(NameIndexField.NAME.toString()); } else { - if (!doSciNamesMatch(firstName, doc.get(ALANameIndexer.IndexField.NAME.toString()))) + if (!doSciNamesMatch(firstName, doc.get(NameIndexField.NAME.toString()))) return null; } } @@ -1791,11 +1738,11 @@ public NameSearchResult searchForCommonName(String name) { */ public String getPrimaryLsid(String lsid) { if (lsid != null) { - TermQuery tq = new TermQuery(new Term("lsid", lsid)); + Query tq = NameIndexField.LSID.search(lsid); try { org.apache.lucene.search.TopDocs results = idSearcher.search(tq, 1); if (results.totalHits.value > 0) - return 
idSearcher.doc(results.scoreDocs[0].doc).get("reallsid"); + return idSearcher.doc(results.scoreDocs[0].doc).get(NameIndexField.REAL_LSID.toString()); } catch (IOException e) { } } @@ -1806,7 +1753,7 @@ public String getPrimaryLsid(String lsid) { public NameSearchResult searchForRecordByLsid(String lsid) { NameSearchResult result = null; try { - Query query = new TermQuery(new Term(NameIndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); TopDocs hits = this.idSearcher.search(query, 1); if (hits.totalHits.value == 0) hits = this.cbSearcher.search(query, 1); @@ -1905,16 +1852,16 @@ private void appendAutocompleteResults(Map output, TopDocs results, } } - private Query buildAutocompleteQuery(String field, String q, boolean allSearches) { + private Query buildAutocompleteQuery(NameIndexField field, String q, boolean allSearches) { //best match - Query fq1 = new BoostQuery(new TermQuery(new Term(field,q)), 12f); //exact match + Query fq1 = new BoostQuery(field.search(q), 12f); //exact match //partial matches - Query fq5 = new WildcardQuery(new Term(field,q + "*")); //begins with that begins with - Query fq6 = new WildcardQuery(new Term(field,"* " + q + "*")); //contains word that begins with + Query fq5 = field.searchWildcard(q + "*"); //begins with that begins with + Query fq6 = field.searchWildcard("* " + q + "*"); //contains word that begins with //any match - Query fq7 = new WildcardQuery(new Term(field,"*" + q + "*")); //any match + Query fq7 = field.searchWildcard("*" + q + "*"); //any match //join BooleanQuery o = new BooleanQuery.Builder() @@ -1927,8 +1874,8 @@ private Query buildAutocompleteQuery(String field, String q, boolean allSearches } private String getPreferredGuid(String taxonConceptGuid) throws Exception { - Query qGuid = new TermQuery(new Term("guid", taxonConceptGuid)); - Query qOtherGuid = new TermQuery(new Term("otherGuid", taxonConceptGuid)); + Query qGuid = NameIndexField.GUID.search(taxonConceptGuid); + Query 
qOtherGuid = NameIndexField.OTHER_GUID.search(taxonConceptGuid); BooleanQuery fullQuery = new BooleanQuery.Builder() .add(qGuid, BooleanClause.Occur.SHOULD) @@ -1937,7 +1884,7 @@ private String getPreferredGuid(String taxonConceptGuid) throws Exception { TopDocs topDocs = cbSearcher.search(fullQuery, 1); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document doc = cbSearcher.doc(scoreDoc.doc); - return doc.get("guid"); + return doc.get(NameIndexField.GUID.toString()); } return taxonConceptGuid; } @@ -2052,7 +1999,7 @@ private String findLSIDByConcatName(String name) { try { String concatName = concatName(name); - Query query = new TermQuery(new Term("concat_name", concatName)); + Query query = NameIndexField.CONCAT_NAME.search(concatName); TopDocs topDocs = cbSearcher.search(query, 2); if (topDocs != null && topDocs.totalHits.value == 1) { @@ -2106,10 +2053,10 @@ public List autocomplete(String q, int max, boolean includeSynonyms) { String uq = q.toUpperCase(); //name search - Query fq = buildAutocompleteQuery("name", lq, false); + Query fq = buildAutocompleteQuery(NameIndexField.NAME, lq, false); BooleanQuery b = new BooleanQuery.Builder() .add(fq, BooleanClause.Occur.MUST) - .add(new WildcardQuery(new Term("left", "*")), includeSynonyms ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST) + .add(NameIndexField.LEFT.searchWildcard("*"), includeSynonyms ? 
BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST) .build(); TopDocs results = cbSearcher.search(b, max); appendAutocompleteResults(output, results, includeSynonyms, false); @@ -2118,7 +2065,7 @@ public List autocomplete(String q, int max, boolean includeSynonyms) { uq = concatName(uq).toUpperCase(); //common name search - fq = buildAutocompleteQuery("common", uq, true); + fq = buildAutocompleteQuery(NameIndexField.SEARCHABLE_COMMON_NAME, uq, true); results = vernSearcher.search(fq, max); appendAutocompleteResults(output, results, includeSynonyms, true); @@ -2177,4 +2124,21 @@ public static void main(String[] args) throws IOException { } } + /** + * Values for fields + */ + private static class Value { + public NameIndexField field; + public T value; + + private Value(NameIndexField field, T value) { + this.field = field; + this.value = value; + } + + public static Value of(NameIndexField field, T value) { + return new Value<>(field, value); + } + } + } diff --git a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java b/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java index 71ccbd30a..a74bdb705 100644 --- a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java +++ b/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java @@ -213,10 +213,9 @@ public boolean create(File namesDwc) throws Exception{ } public void createIrmng(File irmngDwc) throws Exception { - if (irmngDwc == null || !irmngDwc.exists()) - return; - IndexWriter irmngWriter = this.createIndexWriter(new File(this.targetDir, "irmng"), this.analyzer, true); - this.indexIrmngDwcA(irmngWriter, irmngDwc.getCanonicalPath()); + IndexWriter irmngWriter = this.createIndexWriter(new File(this.targetDir, "irmng"), this.analyzer, true); + if (irmngDwc != null && irmngDwc.exists()) + this.indexIrmngDwcA(irmngWriter, irmngDwc.getCanonicalPath()); irmngWriter.commit(); irmngWriter.forceMerge(1); irmngWriter.close(); @@ -492,15 +491,18 @@ public boolean createLoadingIndex(File 
archiveDirectory) throws Exception{ RankType rt = RankType.getForStrRank(taxonRank); if(rt != null){ doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), rt.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), rt.getId())); } else { doc.add(new StringField(NameIndexField.RANK.toString(), taxonRank, Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); } } else { //put in unknown rank doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); } if(StringUtils.equals(taxonID, acceptedNameUsageID) || StringUtils.equals(id, acceptedNameUsageID) || acceptedNameUsageID == null){ //mark this one as an accepted concept @@ -582,7 +584,7 @@ public void generateIndex() throws Exception{ //get all the records that don't have parents that are accepted log.info("Loading index from temporary index."); TopDocs rootConcepts = getLoadIdxResults(null, "root", "T", PAGE_SIZE); - int left = 0; + int left = 1; int right = left; int lastRight = right; int count = 0; @@ -729,8 +731,8 @@ private int addIndex(Document doc, int currentDepth, int currentLeft, LinnaeanRa doc.get(NameIndexField.AUTHOR.toString()), doc.get(NameIndexField.RANK.toString()), doc.get(NameIndexField.RANK_ID.toString()), - 
Integer.toString(left), - Integer.toString(right), + left, + right, newcl, nameComplete, otherNames, @@ -752,7 +754,7 @@ protected Document createALASynonymDocument(String scientificName, String author String genus = null; String specificEpithet = null; String infraspecificEpithet = null; - try { + try { TopDocs hits = this.cbSearcher.search(new TermQuery(new Term(NameIndexField.LSID.toString(), acceptedLsid)), 1); if (hits.totalHits.value > 0) accepted = this.cbSearcher.doc(hits.scoreDocs[0].doc); @@ -786,7 +788,7 @@ protected Document createALASynonymDocument(String scientificName, String author } Document doc = createALAIndexDocument(scientificName, id, lsid, null, null, - kingdom, null, phylum, null, clazz, null, order, null, family, null, genus, null, null, null, null, null, + kingdom, null, phylum, null, clazz, null, order, null, family, null, genus, null, null, null, 0, 0, acceptedLsid, specificEpithet, infraspecificEpithet, author, nameComplete, otherNames, priority); if (doc != null && synonymType != null) { try { diff --git a/src/main/resources/taxonomy.properties b/src/main/resources/taxonomy.properties index 00d2a60e2..216340814 100644 --- a/src/main/resources/taxonomy.properties +++ b/src/main/resources/taxonomy.properties @@ -38,6 +38,7 @@ count.vernacularName.placed=Placed {0} additional vernacular names count.vernacularName.unplaced=Unable to find taxa for {0} additional vernacular names dwca.additionalInfo=Created by combining source taxonomies using the ala-name-matching algorithms. \ See https://github.com/AtlasOfLivingAustralia/ala-name-matching for more information. 
+instance.accepted.invalidLink=Invalid accepted link for {0} {1} - {2} instance.accepted.resolve=Unable to resolve accepted taxon for {3} instance.accepted.resolve.loop=Loop resolving accepted taxon for {3} - {2} instance.accepted.resolve.loop.provenance=Synonym loop resolved by converting to inferred unplaced @@ -47,6 +48,7 @@ instance.discarded.synonym.provenance=Discarded name synonymised into this taxon instance.inferredSynonym.provenance=Inferred from {0} in source {1} instance.multiIndex=Multiple index entries for {3}: {4} {5} choosing first instance.noIndex=No index entry for {3} +instance.parent.invalidLink=Invalid parent link for {0} {1} - {2} instance.parent.resolve=Unable to resolve parent for {3} instance.parent.resolve.loop=Loop resolving parent for {3} - {2} instance.parent.resolve.loop.provenance=Parent loop resolved by replacing parent with the unknown taxon @@ -74,6 +76,7 @@ name.UnrankedScientificName.principal=Principal for unranked scientific name {3} name.principal=Principal for {3} is {4} nomenclaturalCode.notFound=Cant find nomenclatural code {2} nomenclaturalStatus.notFound=Cant find nomenclatural status {2}, ignoring - reported once for each status +provider.archive.noMetadata=Archive has no metadata provider.validation.unknownTaxonID.noID=No unknown taxon identifier provider.validation.unknownTaxonID.notFound=Unknown taxon identifier {0} not found rank.notFound=Cant find rank of {2}, making unranked - reported once for each rank diff --git a/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java b/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java index c256f22ad..a4e43cdea 100644 --- a/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java +++ b/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java @@ -440,6 +440,23 @@ public void testMatch41() { assertFalse(condition.match(instance, key)); } + + @Test + public void testMatch42() { + MatchTaxonCondition 
condition = new MatchTaxonCondition(); + condition.setScientificName("Unknown(\\s.*|)"); + condition.setMatchType(NameMatchType.REGEX); + TaxonConceptInstance instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "unknown", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, null, null, null, null, null, null, null); + NameKey key = this.analyser.analyse(instance); + assertTrue(condition.match(instance, key)); + instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "Unknown sp.", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, null, null, null, null, null, null, null); + key = this.analyser.analyse(instance); + assertTrue(condition.match(instance, key)); + instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "Unknownsp.", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, null, null, null, null, null, null, null); + key = this.analyser.analyse(instance); + assertFalse(condition.match(instance, key)); + } + @Test public void testWrite1() throws Exception { MatchTaxonCondition condition = new MatchTaxonCondition(); diff --git a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java index a4710b16a..a005d1810 100644 --- a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java +++ b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java @@ -20,7 +20,7 @@ public class ALANameSearcherTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new 
ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); } @Test @@ -31,7 +31,7 @@ public void testMisappliedNames1() throws Exception { fail("A misapplied exception should be thrown"); //assertEquals("urn:lsid:biodiversity.org.au:apni.taxon:549612",lsid); } catch (MisappliedException ex) { - assertEquals("https://id.biodiversity.org.au/node/apni/2915977", ex.getMatchedResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51401037", ex.getMatchedResult().getLsid()); //assertNull(ex.getMisappliedResult()); } } @@ -53,12 +53,27 @@ public void testMisappliedNames2() { @Test public void testMisappliedNames3() { try { - String name = "Scleroderma aurantium (L. : Pers.) Pers."; + String name = "Acacia bivenosa DC."; NameSearchResult nsr = searcher.searchForRecord(name); fail("Expecting misapplied exception"); assertNotNull(nsr); } catch (MisappliedException ex) { - assertEquals("92a4e5c4-32c1-44c6-a9f7-410659692dfa", ex.getMatchedResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2912987", ex.getMatchedResult().getLsid()); + } catch (SearchResultException ex) { + fail("Unexpected search exception " + ex); + } + } + + + @Test + public void testMisappliedNames4() { + try { + String name = "Caladenia concinna"; + NameSearchResult nsr = searcher.searchForRecord(name); + fail("Expecting misapplied exception"); + assertNotNull(nsr); + } catch (MisappliedException ex) { + assertEquals("https://id.biodiversity.org.au/taxon/apni/51398909", ex.getMatchedResult().getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -119,7 +134,7 @@ public void parserBlackList() throws Exception { String name = "Petaurus australis unnamed subsp."; String lsid = searcher.searchForLSID(name, true); assertNotNull(lsid); - assertEquals("ALA_Petaurus_australis_unnamed_subsp", lsid); + assertEquals("ALA_3617757", lsid); } @Test @@ 
-133,7 +148,7 @@ public void testRecursiveSearch() { try { NameSearchResult nsr = searcher.searchForRecord(cl, true, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:3309bb2e-5b3f-4664-977b-147e60b66109", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3309bb2e-5b3f-4664-977b-147e60b66109", nsr.getLsid()); System.out.println(nsr); } catch (Exception e) { e.printStackTrace(); @@ -150,8 +165,8 @@ public void testSpeciesSplitSynonym() { } catch (Exception e) { assertTrue(e instanceof ParentSynonymChildException); ParentSynonymChildException psce = (ParentSynonymChildException) e; - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:c195483c-6ef0-4043-8bdf-6d9464bef8f9", psce.getParentResult().getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:db338300-a464-4ccb-bdc6-2cf92665fb7d", psce.getChildResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/2c5fd509-d4d6-4adb-9566-96280ff9e6af", psce.getParentResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/b4f39a2b-cfaf-4c69-8ace-77f1664acd6b", psce.getChildResult().getLsid()); } } @@ -176,7 +191,7 @@ public void testExcludedNames() { } catch (Exception e) { assertTrue(e instanceof ExcludedNameException); ExcludedNameException ene = (ExcludedNameException) e; - assertEquals("urn:lsid:biodiversity.org.au:afd.name:433c43fe-cf38-4b76-9bdb-55a89fbac291", ene.getExcludedName().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/74ac7082-6138-4eb0-86ba-95535deab180", ene.getExcludedName().getLsid()); } String apcExcludedName = "Parestia elegans"; @@ -205,7 +220,7 @@ public void testHomonymsWithResolution1() throws Exception { cl.setScientificName("Thalia"); try { nsr = searcher.searchForRecord("Thalia", null, true); - fail("Thalia should throw a homonym without kingdom or author"); + fail("Thalia should throw a homonym without kingdom or author, got " + nsr.getLsid()); } catch (HomonymException e) { } } @@ 
-220,7 +235,7 @@ public void testHomonymsWithResolution2() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Homonym should be resolved via the Kingdom"); } @@ -250,7 +265,7 @@ public void testHomonymsWithResolution4() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Author should identify homonym value to use"); } @@ -265,7 +280,7 @@ public void testHomonymsWithResolution5() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Author should identify homonym value to use"); } @@ -294,7 +309,7 @@ public void testsStrMarker1(){ cl.setScientificName("Macropus rufus"); nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:fbe09d8b-8cc2-444a-b8f7-d06730543781", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/fbe09d8b-8cc2-444a-b8f7-d06730543781", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -311,7 +326,7 @@ public void testsStrMarker2(){ cl.setScientificName("Osphranter rufus"); nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - 
assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -324,7 +339,7 @@ public void testsStrMarker3() { String name = "Oenochrominae s. str."; // There's only one of these left NameSearchResult nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:537ff8fb-b6c2-4536-9cb8-ad244832c1de", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/537ff8fb-b6c2-4536-9cb8-ad244832c1de", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -349,7 +364,7 @@ public void testsStrMarker5() { String name = "Stennella longirostris longirostris"; NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("ALA_Stennella_longirostris_longirostris", nsr.getLsid()); + assertEquals("ALA_190693", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -361,7 +376,7 @@ public void testsStrMarker6() { String name = "Aplonis fusca hulliana"; NameSearchResult nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:d1674a33-af14-4592-be4d-2ededc1b53cd", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/7b241ea8-07ab-4aa0-a2d7-c0b43767c3d4", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -398,8 +413,8 @@ public void testsStrMarker9() { String name = "Siganus nebulosus"; NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:c2d406d8-1066-4fd3-8c95-31ee6343a1b8", nsr.getLsid()); - 
assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:0aa9653f-00c7-42b9-896b-f399103703b8", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c2d406d8-1066-4fd3-8c95-31ee6343a1b8", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/0aa9653f-00c7-42b9-896b-f399103703b8", nsr.getAcceptedLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); @@ -411,7 +426,7 @@ public void testsStrMarker10() { String name = "Anabathron contabulatum"; NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:eea54328-a4a5-406b-bdfd-3ed119241591", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/39a6129d-dca7-4e3f-bec7-88f0e848c92c", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -443,7 +458,7 @@ public void testQuestionSpeciesMatch() { //test the "name based" synonym "has generic combination" nsr = searcher.searchForRecord("Cacatua leadbeateri", null); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:0217f06f-664c-4c64-bc59-1b54650fa23d", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5815e99d-01cd-4a92-99ba-36f480c4834d", nsr.getAcceptedLsid()); name = "Zieria smithii"; nsr = searcher.searchForRecord(name, null); @@ -465,7 +480,7 @@ public void testSpMarker1() { nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); // Either one can match - assertTrue("https://id.biodiversity.org.au/name/apni/190511".equals(nsr.getLsid()) || "https://id.biodiversity.org.au/name/apni/233691".equals(nsr.getLsid())); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51414212", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -489,7 +504,7 @@ public void testSpMarker3() { try { String name = "Lindernia sp. 
Pilbara (M.N.Lyons & L.Lewis FV 1069)"; NameSearchResult nsr = null; - nsr = searcher.searchForRecord(name, RankType.SUBSPECIES); + nsr = searcher.searchForRecord(name, RankType.SPECIES); assertNotNull(nsr); assertEquals("https://id.biodiversity.org.au/name/apni/51306553", nsr.getLsid()); } catch (SearchResultException e) { @@ -583,7 +598,7 @@ public void testPhraseMatch4() { NameSearchResult nsr = null; nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/name/apni/233691", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51414212", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -596,29 +611,39 @@ public void testPhraseMatch5() { NameSearchResult nsr = null; nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/node/apni/2898916", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/instance/apni/9302042", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } } - @Test - public void testSynonymWithoutRank() { - try { - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setKingdom("Animalia"); - cl.setScientificName("Gymnorhina tibicen"); - NameSearchResult nsr = searcher.searchForRecord(cl, true, true); - assertEquals("Gymnorhina tibicen", nsr.getRankClassification().getScientificName()); - assertEquals("(Latham, 1801)", nsr.getRankClassification().getAuthorship()); - nsr = searcher.searchForRecord("Cracticus tibicen", RankType.SPECIES); - assertEquals("Cracticus tibicen", nsr.getRankClassification().getScientificName()); - nsr = searcher.searchForRecord("Cracticus tibicen", RankType.GENUS); - assertEquals(null, nsr); - } catch (Exception e) { + public void testSynonymWithoutRank1() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom("Animalia"); + 
cl.setScientificName("Gymnorhina tibicen"); + NameSearchResult nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Gymnorhina tibicen", nsr.getRankClassification().getScientificName()); + assertEquals("(Latham, 1801)", nsr.getRankClassification().getAuthorship()); + cl.setScientificName("Cracticus tibicen"); + cl.setRank(RankType.SPECIES.getRank()); + nsr = searcher.searchForRecord(cl, true, true); + assertEquals("ALA_3267030", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5291343e-fdeb-4a65-8ba5-928f5b96acf5", nsr.getAcceptedLsid()); + } - } + + @Test + public void testSynonymWithoutRank2() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Abantiades zonatriticum"); + NameSearchResult nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Abantiades", nsr.getRankClassification().getScientificName()); + assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); + cl.setRank(RankType.SPECIES.getRank()); + nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Abantiades", nsr.getRankClassification().getScientificName()); + assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); } @Test @@ -637,11 +662,11 @@ public void testRecordSearchWithoutScientificName() { @Test public void testInfragenricAndSoundEx1() { - String nameDifferentEnding = "Phylidonyris pyrrhopterus"; + String nameDifferentEnding = "Phylidonyris pyrrhoptera"; try { NameSearchResult nsr = searcher.searchForRecord(nameDifferentEnding, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -654,8 +679,8 @@ public void testInfragenricAndSoundEx2() { try { 
NameSearchResult nsr = searcher.searchForRecord(nameWithInfraGenric, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); - assertEquals(MatchType.EXACT, nsr.getMatchType()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); + assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -663,11 +688,11 @@ public void testInfragenricAndSoundEx2() { @Test public void testInfragenricAndSoundEx3() { - String nameDiffEndInfraGeneric = "Phylidonyris (Phylidonyris) pyrrhopterus"; + String nameDiffEndInfraGeneric = "Phylidonyris (Phylidonyris) pyrrhopteras"; try { NameSearchResult nsr = searcher.searchForRecord(nameDiffEndInfraGeneric, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -680,7 +705,7 @@ public void testInfragenricAndSoundEx4() { try { NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1ba5449-a68e-4c3b-ae90-8e667617945b", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c7d8dbc8-dcde-4182-85ba-907182f95ea9", nsr.getLsid()); assertEquals(MatchType.EXACT, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -693,7 +718,7 @@ public void testInfragenricAndSoundEx5() { try { NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - 
assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1ba5449-a68e-4c3b-ae90-8e667617945b", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c7d8dbc8-dcde-4182-85ba-907182f95ea9", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -706,8 +731,8 @@ public void testInfragenricAndSoundEx6() { try { NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("CAAB:79629da1:6054320e:589caaa6:bb265593", nsr.getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:1a85a82f-5a1f-4c56-9f04-918643461260", nsr.getAcceptedLsid()); + assertEquals("SY_39006017_1", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/1a85a82f-5a1f-4c56-9f04-918643461260", nsr.getAcceptedLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -780,7 +805,7 @@ public void testOutOfGeography1() { try { NameSearchResult nsr = searcher.searchForRecord(classification, true, true, true); assertNotNull(nsr); - assertEquals("ALA_Proboscidea", nsr.getLsid()); + assertEquals("ALA_3267031", nsr.getLsid()); assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); assertEquals(RankType.ORDER, nsr.getRank()); } catch (SearchResultException ex) { @@ -797,7 +822,7 @@ public void testOutOfGeography2() { try { NameSearchResult nsr = searcher.searchForRecord(classification, true, true, true); assertNotNull(nsr); - assertEquals("ALA_Myrina", nsr.getLsid()); + assertEquals("ALA_3267033", nsr.getLsid()); assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); assertEquals(RankType.GENUS, nsr.getRank()); } catch (SearchResultException ex) { @@ -856,7 +881,7 @@ public void testPhraseNames() { public void testNoRank() { try { String lsid = searcher.searchForLSID("Animalia"); - 
assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:4647863b-760d-4b59-aaa1-502c8cdf8d3c", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/4647863b-760d-4b59-aaa1-502c8cdf8d3c", lsid); lsid = searcher.searchForLSID("Bacteria"); assertEquals("NZOR-6-73174", lsid); } catch (SearchResultException e) { @@ -911,7 +936,7 @@ public void testIgnoredHomonyms2() { cl.setGenus("Macropus"); //NameSearchResult nsr =searcher.searchForRecord(cl.getId(), cl, null, true,true); String lsid = searcher.searchForLSID("Macropus", false, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1d9bf29-648f-47e6-8544-2c2fbdf632b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/b1d9bf29-648f-47e6-8544-2c2fbdf632b1", lsid); } catch (Exception e) { fail("ignored homonyms should not throw exception " + e.getMessage()); } @@ -940,7 +965,7 @@ public void testIgnoredHomonyms4() { cl.setGenus("Agathis"); cl.setKingdom("Animalia"); NameSearchResult nsr = searcher.searchForRecord(cl.getScientificName(), cl, null, true, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:a4109d9e-723c-491a-9363-95df428fe230", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/d02923bc-cf54-4d7f-ae74-aac1d6af1830", nsr.getLsid()); } catch (Exception e) { fail("A kingdom was supplied and should be resolvable. 
" + e.getMessage()); } @@ -1022,7 +1047,7 @@ public void testCommonNames1() { String name = "Red Kangaroo"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", lsid); assertEquals("Osphranter rufus", sciName); } @@ -1031,8 +1056,8 @@ public void testCommonNames2() { String name = "Yellow-tailed Black-Cockatoo"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:72ca8d75-71da-4751-a5cf-aa07ac3869f7", lsid); - assertEquals("Calyptorhynchus (Zanda) funereus", sciName); + assertEquals("https://biodiversity.org.au/afd/taxa/145b081d-eca7-4d9b-9171-b97e2d061536", lsid); + assertEquals("Zanda funerea", sciName); } @Test @@ -1040,7 +1065,7 @@ public void testCommonNames3() { String name = "Scarlet Robin"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b02a8195-266e-463b-89b7-3dc2a1c48450", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/a3e5376b-f9e6-4bdf-adae-1e7add9f5c29", lsid); assertEquals("Petroica (Petroica) boodang", sciName); } @@ -1049,7 +1074,7 @@ public void testCommonNames4() { String name = "Pacific Bluefin Tuna"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b35bf6d6-3b67-4d4c-b81e-b7ca7a64d341", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/b35bf6d6-3b67-4d4c-b81e-b7ca7a64d341", lsid); assertEquals("Thunnus orientalis", sciName); } @@ -1058,7 +1083,7 @@ public void testCommonNames5() { String name = "Pacific Black Duck"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - 
assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:da8a156f-95e2-4fcb-a6e7-52721705a70c", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/81be58f5-caf7-4f3d-b1eb-d4f83eb0af5a", lsid); assertEquals("Anas (Anas) superciliosa", sciName); } @@ -1067,7 +1092,7 @@ public void testCommonNames6() { String name = "European Carp"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:16171fac-8d6c-4327-9fab-f2db864d71bf", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/16171fac-8d6c-4327-9fab-f2db864d71bf", lsid); assertEquals("Cyprinus carpio", sciName); } @@ -1075,13 +1100,13 @@ public void testCommonNames6() { public void testCommonNames7() { String name = "Sulphur-crested Cockatoo"; String lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); name = "Sulphur crested Cockatoo"; lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); name = "SULPHUR CRESTED COCKATOO"; lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); String sciName = getCommonName(name); assertEquals("Cacatua (Cacatua) galerita", sciName); } @@ -1159,7 +1184,7 @@ public void testMyrmecia() { public void testSearchForLSID1() { try { String lsid = searcher.searchForLSID("Anochetus"); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); 
} catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1169,7 +1194,7 @@ public void testSearchForLSID1() { public void testSearchForLSID2() { try { String lsid = searcher.searchForLSID("Anochetus", true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1179,7 +1204,7 @@ public void testSearchForLSID2() { public void testSearchForLSID3() { try { String lsid = searcher.searchForLSID("Anochetus", true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1189,7 +1214,7 @@ public void testSearchForLSID3() { public void testSearchForLSID4() { try { String lsid = searcher.searchForLSID("Anochetus", RankType.GENUS); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1200,7 +1225,7 @@ public void testSearchForLSID5() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID("Anochetus", cl, RankType.GENUS); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search 
exception " + ex.getMessage()); } @@ -1211,7 +1236,7 @@ public void testSearchForLSID6() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1222,7 +1247,7 @@ public void testSearchForLSID7() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1230,11 +1255,11 @@ public void testSearchForLSID7() { @Test public void testFuzzyMatches() throws Exception { - //Eolophus roseicapillus - non fuzzy match - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:577ff059-a2a7-48b0-976c-fdd6a345f878", searcher.searchForLSID("Eolophus roseicapilla")); + //Eolophus roseicapilla - non fuzzy match + assertEquals("https://biodiversity.org.au/afd/taxa/9b4ad548-8bb3-486a-ab0a-905506c463ea", searcher.searchForLSID("Eolophus roseicapilla")); - //Eolophus roseicapilla - fuzzy match - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:577ff059-a2a7-48b0-976c-fdd6a345f878", searcher.searchForLSID("Eolophus roseicapillus", true)); + //Eolophus roseicapillus - fuzzy match + assertEquals("https://biodiversity.org.au/afd/taxa/9b4ad548-8bb3-486a-ab0a-905506c463ea", searcher.searchForLSID("Eolophus 
roseicapillus", true)); } @Test @@ -1308,10 +1333,10 @@ public void testRankMarker() { @Test public void testSimpleLookup1() { try { - String name = "Megalurus gramineus"; + String name = "Poodytes gramineus"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b88430ed-f7d7-482e-a586-f0a02d8e11ce", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/061fef09-7c9d-4b6d-9827-4da13a350dc6", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1324,7 +1349,7 @@ public void testSimpleLookup2() { String name = "Synemon plana"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:a51dca29-50e7-49b4-ae35-5c35a9c4f854", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/a51dca29-50e7-49b4-ae35-5c35a9c4f854", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1349,7 +1374,7 @@ public void testSimpleLookup4() { String name = "Chenopodium x bontei nothovar. 
submelanocarpum"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/instance/apni/769095", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2902250", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1363,8 +1388,8 @@ public void testSimpleLookup5() { cl.setScientificName("Favolus princeps"); NameSearchResult nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); // Been removed - assertEquals("43e1bc65-3580-47db-b269-cdb066ed49e9", nsr.getLsid()); - assertEquals( "10911fd1-a2dd-41f1-9c4d-8dff7f118670", nsr.getAcceptedLsid()); + assertEquals("https://id.biodiversity.org.au/instance/fungi/60071845", nsr.getLsid()); + assertEquals( "https://id.biodiversity.org.au/node/fungi/60098663", nsr.getAcceptedLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1393,7 +1418,7 @@ public void testSimpleLookup7() { String name = "Astomum"; NameSearchResult nsr = searcher.searchForRecord(name, cl, RankType.GENUS); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/name/ausmoss/10001613", nsr.getLsid()); + assertEquals("NZOR-6-29460", nsr.getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1404,7 +1429,7 @@ public void testSimpleLookup8() { try { String name = "Carbo ater"; NameSearchResult nsr = searcher.searchForRecord(name); - fail("Expecting ecxluded name exception"); + fail("Expecting excluded name exception"); } catch (ExcludedNameException ex) { assertNull(ex.getNonExcludedName()); // Two types both excluded } catch (SearchResultException ex) { @@ -1418,7 +1443,7 @@ public void testSimpleLookup9() { String name = "Neobatrachus sudellae"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:953a5af4-2932-4c8b-8f33-850b5f8f3fed", nsr.getLsid()); + 
assertEquals("https://biodiversity.org.au/afd/taxa/953a5af4-2932-4c8b-8f33-850b5f8f3fed", nsr.getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1449,6 +1474,75 @@ public void testSimpleLookup12() { fail("Unexpected search exception " + e); } } + + // Do not match nom illeg. taxonomic status + @Test + public void testSimpleLookup13() throws Exception { + String name = "Banksia collina"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/instance/apni/838699", nsr.getLsid()); + } + + @Test + public void testSimpleLookup14() throws Exception { + String name = "Stephanopis similis"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://biodiversity.org.au/afd/taxa/24bc164a-85b2-4633-85c5-a3b399daec0a", nsr.getLsid()); + } + + @Test + public void testSimpleLookup15() throws Exception { + String name = "Fraus latistria"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://biodiversity.org.au/afd/taxa/2358fcc0-8db2-475d-8da4-fd4bd5e711f2", nsr.getLsid()); + } + + @Test + public void testSimpleLookup16() throws Exception { + String name = "Metrosideros fulgens"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/name/apni/110385", nsr.getLsid()); + } + + + @Test + public void testSimpleLookup17() throws Exception { + String name = "Metrosideros scandens"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/name/apni/233086", nsr.getLsid()); + } + + + @Test + public void testMetricsLookup1() throws Exception { + String name = "Geopelia placida"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, 
true, true); + assertNotNull(metrics); + assertEquals("https://biodiversity.org.au/afd/taxa/3d5c4e0d-5138-46e0-8e14-5acd8fd2c523", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); + } + + @Ignore // Until sub-taxon synonymy decided + public void testMetricsLookup2() throws Exception { + String name = "Trigonaphera vinnulum"; // Synonym of Trigonostoma vinnulum + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true, true); + assertNotNull(metrics); + assertEquals("https://biodiversity.org.au/afd/taxa/7e67e588-927e-48a9-8765-365ae9f25fcb", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5855a347-eee2-47bb-8130-94d49602d232", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); + } + @Test public void testParentChildSynonym1() { try { @@ -1458,7 +1552,7 @@ public void testParentChildSynonym1() { } catch (ParentSynonymChildException ex) { NameSearchResult nsr = ex.getChildResult(); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:d0e66526-1cdd-4b03-85b2-71b7e7d8b84a", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3e062650-6ecb-43e7-a903-5487e3dbbbb5", nsr.getLsid()); assertEquals(RankType.SUBSPECIES, nsr.getRank()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); @@ -1474,7 +1568,7 @@ public void testParentChildSynonym2() { } catch (ParentSynonymChildException ex) { NameSearchResult nsr = ex.getChildResult(); assertNotNull(nsr); - assertEquals("8e64942a-f300-46c8-ba97-76492d25d985", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/node/fungi/60083449", nsr.getLsid()); 
assertEquals(RankType.FORM, nsr.getRank()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); @@ -1490,8 +1584,8 @@ public void testStigmoderaAurifera() { cl.setScientificName(name); NameSearchResult nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:e89de580-2942-479d-b5ef-5edd60424560", nsr.getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2e8ac1d8-5f2b-4fcd-a124-c619c7cab6b0", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/426ab801-0d5f-4b43-b1b4-55ce7ce7a44e", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/6c212123-fadc-4307-8dd8-ac501bb534ba", nsr.getAcceptedLsid()); assertEquals("Stigmodera aurifera", nsr.getRankClassification().getScientificName()); assertEquals(MatchType.CANONICAL, nsr.getMatchType()); } catch (SearchResultException e) { @@ -1577,7 +1671,7 @@ public void testHigherTaxonMatch2() { } @Test - public void testHomonymWithOrderResolution1() { + public void testHomonymWithOrderResolution1() throws Exception { try { String name = "Abelia"; LinnaeanRankClassification cl = new LinnaeanRankClassification(); @@ -1585,22 +1679,16 @@ public void testHomonymWithOrderResolution1() { NameSearchResult nsr = searcher.searchForRecord(cl, true); fail("Expecting homonym exception"); } catch (HomonymException ex) { - assertEquals(1, ex.getResults().size()); - } catch (SearchResultException e) { - fail("Unexpected search exception " + e); + assertEquals(2, ex.getResults().size()); } - try { - String name = "Abelia"; - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setScientificName(name); - cl.setOrder("Dipsacales"); - NameSearchResult nsr = searcher.searchForRecord(cl, true); - assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/node/apni/2892114", nsr.getLsid()); - } catch (SearchResultException e) { - fail("Unexpected search exception " + e); - } - } + String 
name = "Abelia"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + cl.setOrder("Dipsacales"); + NameSearchResult nsr = searcher.searchForRecord(cl, true); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/node/apni/2892114", nsr.getLsid()); + } @Test public void testMultipleMisappliedResolution1() throws Exception { @@ -1639,6 +1727,26 @@ public void testMultipleMisappliedResolution3() throws Exception { assertTrue(metrics.getErrors().contains(ErrorType.MISAPPLIED)); } + // Ensure misapplication is ignored + @Test + public void testMultipleMisappliedResolution4() throws Exception { + String name = "Pterostylis bryophila"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51412050", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + name = "Pterostylis obtusa"; + cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51412242", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.MATCH_MISAPPLIED)); + } + // Synonym and accepted @Test @@ -1678,6 +1786,20 @@ public void testSynonymAccepted3() throws Exception { assertEquals("https://id.biodiversity.org.au/node/apni/2911212", metrics.getResult().getAcceptedLsid()); } + + @Test + public void testSynonymAccepted4() throws Exception { + String name = "Sugomel niger"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + 
assertNotNull(metrics); + assertEquals("ALA_3782348", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.NONE)); + assertEquals("https://biodiversity.org.au/afd/taxa/b32a2ec6-315c-48cf-84b3-4898e39f4b57", metrics.getResult().getAcceptedLsid()); + } + // Available as a synonym but also misapplied. @Test public void testSynonymMisapplied1() throws Exception { @@ -1711,7 +1833,7 @@ public void testHigherTaxonomy() throws Exception { @Test public void testPhraseName1() throws Exception { String name = "Tephrosia sp. Crowded pinnae (C.R.Dunlop 8202)"; - String kingdom = "Planate"; + String kingdom = "Plantae"; String phylum = "Streptophyta"; String class_ = "Equisetopsida"; String order = "Fabales"; @@ -1725,7 +1847,7 @@ public void testPhraseName1() throws Exception { cl.setOrder(order); cl.setGenus(genus); cl.setSpecificEpithet(specificEpithet); - //cl.setRank(rank); + cl.setRank(rank); cl.setScientificName(name); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); assertNotNull(metrics); @@ -1736,8 +1858,8 @@ public void testPhraseName1() throws Exception { @Test public void testPhraseName2() throws Exception { - String name = "Tephrosia sp. Miriam Vale (E.J.Thompson+ MIR33)"; - String kingdom = "Planate"; + String name = "Tephrosia sp. 
(Miriam Vale E.J.Thompson+ MIR33)"; + String kingdom = "Plantae"; String class_ = "Equisetopsida"; String genus = "Tephrosia"; String rank = "species"; @@ -1749,8 +1871,69 @@ public void testPhraseName2() throws Exception { cl.setScientificName(name); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/instance/apni/51376249", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2903953", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertEquals(SynonymType.SUBJECTIVE_SYNONYM, metrics.getResult().getSynonymType()); + } + + @Test + public void testPhraseName3() throws Exception { + String name = "Thryptomene sp. Leinster (B.J. Lepschi & L.A. Craven 4362) PN"; + String kingdom = "Plantae"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/node/apni/2904210", metrics.getResult().getLsid()); + assertEquals(MatchType.PHRASE, metrics.getResult().getMatchType()); + } + + @Test + public void testPhraseName4() throws Exception { + String name = "Tephrosia sp. 
Miriam Vale (E.J.Thompson+ MIR33) WA Herbarium"; + String kingdom = "Plantae"; + String class_ = "Equisetopsida"; + String genus = "Tephrosia"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setKlass(class_); + cl.setGenus(genus); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); assertEquals("https://id.biodiversity.org.au/node/apni/2903953", metrics.getResult().getLsid()); assertEquals(MatchType.PHRASE, metrics.getResult().getMatchType()); } + // Ensure illegitimate names are excluded from the system and don't gum the works up + @Test + public void testIllegitimate1() throws Exception { + String name = "Banksia collina"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/instance/apni/838699", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2900678", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertEquals(SynonymType.OBJECTIVE_SYNONYM, metrics.getResult().getSynonymType()); + } + + + // Ensure illegitimate names are excluded from the system and don't gum the works up + @Test + public void testIllegitimate2() throws Exception { + String name = "Zieria fordii"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/name/apni/51337126", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51367864", metrics.getResult().getRankClassification().getGid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51367862", 
metrics.getResult().getRankClassification().getFid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + } + } diff --git a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java index 5885fcf05..9d1b670b7 100644 --- a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java +++ b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java @@ -22,7 +22,7 @@ public class BiocacheMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); } @Test @@ -73,20 +73,15 @@ public void synonymHomonymIssue(){ } @Test - public void testRecursiveAuthorshipIssue() { - try { - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setScientificName("Graphis notreallyaname Mull.Arg."); - cl.setAuthorship("Mull.Arg."); - cl.setKingdom("Animalia"); - cl.setGenus("Graphis"); - cl.setSpecificEpithet("notreallyaname"); - MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2af76a1e-2086-46e3-90b9-6f00983b21a5", metrics.getResult().getLsid()); // Graphis from AFD - } catch (Exception e) { - e.printStackTrace(); - fail("Exception should not occur"); - } + public void testRecursiveAuthorshipIssue1() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Graphis notreallyaname Mull.Arg."); + cl.setAuthorship("Mull.Arg."); + cl.setKingdom("Animalia"); + cl.setGenus("Graphis"); + cl.setSpecificEpithet("notreallyaname"); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertEquals("https://biodiversity.org.au/afd/taxa/2af76a1e-2086-46e3-90b9-6f00983b21a5", metrics.getResult().getLsid()); // Graphis from AFD } @Test @@ -98,7 +93,17 @@ public void 
testRecursiveAuthorshipIssue2() throws Exception { cl.setGenus("Graphis"); cl.setSpecificEpithet("notreallyaname"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("NZOR-6-122770", metrics.getResult().getLsid()); // Can't find Graphis since not APC placed so gets Graphidaceae + assertEquals("NZOR-6-132826", metrics.getResult().getLsid()); // Can't find Graphis homonym so gets Graphidaceae + } + + @Test + public void testRecursiveAuthorshipIssue3() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Graphis"); + cl.setKingdom("Fungi"); + cl.setGenus("Graphis"); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertEquals("NZOR-6-122770", metrics.getResult().getLsid()); // Can't find Graphis homonym so gets Graphidaceae } @Test @@ -147,7 +152,7 @@ public void testSPNovName() { cl.setSpecificEpithet(spEp); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); //System.out.println(metrics.getResult()); - assertEquals("http://id.biodiversity.org.au/instance/apni/884433", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/instance/apni/884433", metrics.getResult().getLsid()); assertTrue(metrics.getErrors().contains(ErrorType.HOMONYM)); } catch (Exception e) { @@ -236,7 +241,7 @@ public void testParentChildWithDifferentSpelling1() throws Exception { cl.setScientificName("Climacteris affinis"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:5d7c50bc-2c2d-4984-9924-d2a46dc3b00f", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/0d28bce2-0bae-44f6-9c73-0afc0f343b8c", metrics.getResult().getLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); // Dereferenced synonym assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); } @@ -247,7 +252,7 @@ public void 
testParentChildWithDifferentSpelling2() throws Exception { cl.setScientificName("Limnodynastes dumerilii"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2c50c2f6-7a0d-44e1-b549-458427b420c4", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/2c50c2f6-7a0d-44e1-b549-458427b420c4", metrics.getResult().getLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); // Dereferenced synonym assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); } @@ -259,7 +264,7 @@ public void testAffCfSpecies1() throws Exception { // No issues cl.setScientificName("Zabidius novemaculeatus"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:58e06bba-de3b-4c8c-b165-d75bbeb21a36", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/58e06bba-de3b-4c8c-b165-d75bbeb21a36", metrics.getResult().getLsid()); assertTrue(metrics.getErrors().contains(ErrorType.NONE)); cl = new LinnaeanRankClassification(); @@ -453,8 +458,8 @@ public void testDingo1() { cl.setScientificName(name); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); assertNotNull(metrics); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:3064f20b-f6de-4375-8377-904cbd6cf9fa", metrics.getResult().getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:c2056f1b-fcde-45b9-904b-1cab280368d1", metrics.getResult().getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3064f20b-f6de-4375-8377-904cbd6cf9fa", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c2056f1b-fcde-45b9-904b-1cab280368d1", metrics.getResult().getAcceptedLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); diff --git 
a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java b/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java index b4a77dc74..cd2c324e0 100644 --- a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java +++ b/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java @@ -30,7 +30,7 @@ public class IconicSpeciesTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); } //@Test diff --git a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java b/src/test/java/au/org/ala/names/search/VernacularMatchTest.java index 20a34d554..62ac8c484 100644 --- a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java +++ b/src/test/java/au/org/ala/names/search/VernacularMatchTest.java @@ -24,13 +24,13 @@ public class VernacularMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214-lucene8"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); } @Test public void testVernacular1() throws Exception { String name = "Mary River Turtle"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:d315deea-822c-4f2c-b439-da33d6af5fd6"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/d315deea-822c-4f2c-b439-da33d6af5fd6"; NameSearchResult result = null; result = searcher.searchForCommonName(name); @@ -52,7 +52,7 @@ public void testVernacular2() throws Exception { @Test public void testVernacular3() throws Exception { String name = "Drain Mangrovegoby"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:19c60dcd-93a0-40a2-9ac1-3abe7119c505"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/19c60dcd-93a0-40a2-9ac1-3abe7119c505"; NameSearchResult result = null; result = searcher.searchForCommonName(name); @@ -64,7 +64,7 @@ public 
void testVernacular3() throws Exception { @Test public void testVernacular4() throws Exception { String name = "Onespine Unicornfish"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:f7bfd383-5501-4196-9acb-d9d4d03cc45d"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/f7bfd383-5501-4196-9acb-d9d4d03cc45d"; NameSearchResult result = null; result = searcher.searchForCommonName(name); diff --git a/src/test/resources/au/org/ala/names/search/iconic_species_list.csv b/src/test/resources/au/org/ala/names/search/iconic_species_list.csv index d96313247..568e24879 100644 --- a/src/test/resources/au/org/ala/names/search/iconic_species_list.csv +++ b/src/test/resources/au/org/ala/names/search/iconic_species_list.csv @@ -23,12 +23,12 @@ BIRDS ,Boobook Owl,,Animalia,Chordata,Aves,STRIGIFORMES,STRIGIDAE,Ninox,novaeseelandiae,,,Yes,Yes,Yes ,Little Raven,,Animalia,Chordata,Aves,PASSERIFORMES,CORVIDAE,Corvus,mellori,,,Yes,Yes,Yes ,Sulphur-crested Cockatoo,,Animalia,Chordata,Aves,PSITTACIFORMES,CACATUIDAE,Cacatua,galerita,,,Yes,Yes,Yes -,Osprey,,Animalia,Chordata,Aves,FALCONIFORMES,ACCIPITRIDAE,Pandion,haliaetus,,,Yes,Yes,Yes +,Osprey,,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Pandion,haliaetus,,,Yes,Yes,Yes ,Major Mitchell Cockatoo,,Animalia,Chordata,Aves,PSITTACIFORMES,CACATUIDAE,Lophochroa,leadbeateri,,,Yes,Yes,Yes ,Southern Cassowary,,Animalia,Chordata,Aves,STRUTHIONIFORMES,CASUARIIDAE,Casuarius,casuarius,,,Yes,No,No ,Cape Baron Goose,,Animalia,Chordata,Aves,ANSERIFORMES,ANATIDAE,Cereopsis,novaehollandiae,novaehollandiae,,Yes,Yes,Yes ,Brolga,,Animalia,Chordata,Aves,GRUIFORMES,GRUIDAE,Grus,rubicunda,,,Yes,No,No -,Wedge-tailed Eagle,,Animalia,Chordata,Aves,FALCONIFORMES,ACCIPITRIDAE,Aquila,audax,,,Yes,No,Yes +,Wedge-tailed Eagle,,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Aquila,audax,,,Yes,No,Yes FISH ,Barramundi,,Animalia,CHORDATA,ACTINOPTERYGII,PERCIFORMES,LATIDAE,Lates,calcarifer,,,yes,yes,yes @@ -116,7 +116,7 @@ REPTILES 
INVERTEBRATES ,Red-backed Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,THERIDIIDAE,Latrodectus,hasseltii,,,yes,yes,yes -,Sydney Funnelweb Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,HEXATHELIDAE,Atrax,robustus,,,yes,yes,yes +,Sydney Funnelweb Spider,,Animalia,Arthropoda,Arachnida,Araneae,Atracidae,Atrax,robustus,,,yes,yes,yes ,Red-headed Mouse Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,ACTINOPODIDAE,Missulena,occatoria,,,yes,yes,yes ,Cairn's Birdwing,,Animalia,Arthropoda,Insecta,LEPIDOPTERA,PAPILIONIDAE,Ornithoptera,priamus,,,yes,yes,yes ,Cabbage White Butterfly,,Animalia,Arthropoda,Insecta,LEPIDOPTERA,PIERIDAE,Pieris,rapae,,,yes,yes,yes @@ -196,12 +196,12 @@ Marine,Blue Groper,Official,Animalia,,,,,Achoerodus,viridis NT Animal,Red Kangaroo,Official,Animalia,CHORDATA,MAMMALIA,DIPROTODONTIA,MACROPODIDAE,Osphranter,rufus,,,yes,yes - limited,yes Plant,Sturt's Desert Rose,official,Plantae,Charophyta,Equisetopsida,MALVALES,MALVACEAE,Gossypium,sturtianum,,,yes,yes - limited,yes -Bird,Wedge-tailed,Official,Animalia,CHORDATA,AVES,FALCONIFORMES,ACCIPITRIDAE,Aquila,audax,,Wedge-tailed Eagle,yes,yes - limited,yes - change image on region page +Bird,Wedge-tailed,Official,Animalia,CHORDATA,AVES,Accipitriformes,Accipitridae,Aquila,audax,,Wedge-tailed Eagle,yes,yes - limited,yes - change image on region page Marine,No emblem QLD Animal,Koala,Official,ANIMALIA,CHORDATA,MAMMALIA,DIPROTODONTIA,PHASCOLARCTIDAE,Phascolarctos,cinereus,,,yes,yes,yes - change image on Region Page -Plant,,Official,,,,,,Vappodes,phalaenopsis,,Cooktown Orchid,yes,yes - limited,yes - limited +Plant,,Official,Plantae,Charophyta,Equisetopsida,Asparagales,Orchidaceae,Dendrobium,bigibbum ,,Cooktown Orchid,yes,yes - limited,yes - limited Bird,Brolga,Official,ANIMALIA,CHORDATA,AVES,GRUIFORMES,GRUIDAE,Grus,rubicunda,,,yes,yes,yes Marine,Anemone Fish,Official,ANIMALIA,,,,,Amphiprion,akindynos From 900ad5efa68621977bfd66a12e9f0b301006e77d Mon Sep 17 00:00:00 2001 From: pal155 Date: Fri, 24 Sep 2021 10:19:04 
+1000 Subject: [PATCH 10/19] Updated test cases for new index Improved phrase name parsing (to be phased out, since GBIF now has phrase names) Rank comparator (unused for now, until reversed ranks structures can be worked out) --- .travis.yml | 6 +- README.md | 2 +- .../org/ala/names/index/RankComparator.java | 136 ++++++++++++++ .../org/ala/names/index/ScientificName.java | 16 +- .../java/au/org/ala/names/model/RankType.java | 5 +- .../org/gbif/nameparser/PhraseNameParser.java | 12 +- .../ala/names/index/RankComparatorTest.java | 166 ++++++++++++++++++ .../parser/util/PhraseNameParserTests.java | 26 ++- .../ala/names/search/ALANameSearcherTest.java | 5 +- .../ala/names/search/BiocacheMatchTest.java | 8 +- .../ala/names/search/IconicSpeciesTest.java | 2 +- .../ala/names/search/VernacularMatchTest.java | 4 +- 12 files changed, 355 insertions(+), 33 deletions(-) create mode 100644 src/main/java/au/org/ala/names/index/RankComparator.java create mode 100644 src/test/java/au/org/ala/names/index/RankComparatorTest.java diff --git a/.travis.yml b/.travis.yml index db985dbf4..bd4809b34 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,10 +10,10 @@ branches: before_install: - mkdir -p ~/.m2; wget -q -O ~/.m2/settings.xml https://raw.githubusercontent.com/AtlasOfLivingAustralia/travis-build-configuration/master/travis_maven_settings_simple.xml -- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210629.tgz https://archives.ala.org.au/archives/nameindexes/20210629/namematching-20210629.tgz +- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210811.tgz https://archives.ala.org.au/archives/nameindexes/20210811/namematching-20210811.tgz - cd /data/lucene -- sudo tar zxvf namematching-20210629.tgz -- sudo ln -s namematching-20210629 namematching +- sudo tar zxvf namematching-20210811.tgz +- sudo ln -s namematching-20210811 namematching - ls -laF - cd $TRAVIS_BUILD_DIR diff --git a/README.md b/README.md index 6d0dd4438..7abe2c0e5 100644 --- 
a/README.md +++ b/README.md @@ -83,7 +83,7 @@ The build creates 3 artefacts in the ala-name-matching/target directory: * ala-name-matching-3.5-sources.jar - source jar for the project code only The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20220629) and needs to be extracted to the -directory `/data/lucene/namematching-20210629` +directory `/data/lucene/namematching-20210811` ## ALA Names List diff --git a/src/main/java/au/org/ala/names/index/RankComparator.java b/src/main/java/au/org/ala/names/index/RankComparator.java new file mode 100644 index 000000000..9fbacf7d8 --- /dev/null +++ b/src/main/java/au/org/ala/names/index/RankComparator.java @@ -0,0 +1,136 @@ +package au.org.ala.names.index; + +import au.org.ala.names.model.RankType; +import org.gbif.checklistbank.model.Equality; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; + +import static au.org.ala.names.model.RankType.*; + +/** + * Compare two ranks and establish whether they are close enough to each other or different. 
+ */ +public class RankComparator { + private static final RankType[][] RANK_RANGES = new RankType[][] { + new RankType[] { DOMAIN, DOMAIN, KINGDOM }, + new RankType[] { KINGDOM, DOMAIN, INFRAKINGDOM }, + new RankType[] { SUBKINGDOM, KINGDOM, SUPERPHYLUM }, + new RankType[] { INFRAKINGDOM, KINGDOM, SUPERPHYLUM }, + new RankType[] { SUPERPHYLUM, INFRAKINGDOM, INFRAPHYLUM }, + new RankType[] { PHYLUM, INFRAKINGDOM, SUPERCLASS }, + new RankType[] { SUBPHYLUM, INFRAKINGDOM, CLASS }, + new RankType[] { INFRAPHYLUM, PHYLUM, CLASS }, + new RankType[] { SUPERCLASS, INFRAPHYLUM, INFRACLASS }, + new RankType[] { CLASS, SUBPHYLUM, SUPERORDER }, + new RankType[] { SUBCLASS, INFRAPHYLUM, ORDER }, + new RankType[] { INFRACLASS, CLASS, ORDER }, + new RankType[] { SUBINFRACLASS, SUBCLASS, ORDER }, + new RankType[] { SUPERDIVISION_ZOOLOGY, SUBCLASS, ORDER }, + new RankType[] { DIVISION_ZOOLOGY, SUBCLASS, ORDER }, + new RankType[] { SUBDIVISION_ZOOLOGY, SUBCLASS, ORDER }, + new RankType[] { SUPERCOHORT, SUBCLASS, ORDER }, + new RankType[] { COHORT, SUBCLASS, ORDER }, + new RankType[] { SUBCOHORT, SUBCLASS, ORDER }, + new RankType[] { SUPERORDER, INFRACLASS, INFRAORDER }, + new RankType[] { ORDER, SUBCLASS, SUPERFAMILY }, + new RankType[] { SUBORDER, INFRACLASS, FAMILY }, + new RankType[] { INFRAORDER, ORDER, FAMILY }, + new RankType[] { PARVORDER, SUBORDER, FAMILY }, + new RankType[] { SUPERSERIES_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SERIES_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SUBSERIES_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SUPERSECTION_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SECTION_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SUBSECTION_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { FAMILY, SUBORDER, INFRAFAMILY }, + new RankType[] { SUBFAMILY, INFRAORDER, GENUS }, + new RankType[] { INFRAFAMILY, FAMILY, GENUS }, + new RankType[] { SUPERTRIBE, SUBFAMILY, GENUS }, + new RankType[] { TRIBE, SUBFAMILY, GENUS }, + new RankType[] { SUBTRIBE, 
SUBFAMILY, GENUS }, + new RankType[] { SUPERGENUS, INFRAFAMILY, INFRAGENUS }, + new RankType[] { GENUS_GROUP, INFRAFAMILY, INFRAGENUS }, + new RankType[] { GENUS, INFRAFAMILY, SUPERSPECIES }, + new RankType[] { SUBGENUS, INFRAFAMILY, SUPERSPECIES }, + new RankType[] { INFRAGENUS, INFRAFAMILY, SUPERSPECIES }, + new RankType[] { SUPERSECTION_BOTANY, GENUS, SPECIES_SUBGROUP }, + new RankType[] { SECTION_BOTANY, GENUS, SPECIES_SUBGROUP }, + new RankType[] { SUBSECTION_BOTANY, GENUS, SPECIES_SUBGROUP }, + new RankType[] { SUPERSERIES_BOTANY, GENUS, SPECIES_SUBGROUP }, + new RankType[] { SERIES_BOTANY, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SUBSERIES_BOTANY, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { INFRAGENERICNAME, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SPECIES_GROUP, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SUPERSPECIES, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SPECIES_SUBGROUP, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SPECIES, SPECIES, TELEOMORPH }, + new RankType[] { NOTHOSPECIES, SPECIES, SUBSPECIES }, + new RankType[] { HOLOMORPH, SPECIES, SUBSPECIES }, + new RankType[] { ANAMORPH, SPECIES, SUBSPECIES }, + new RankType[] { TELEOMORPH, SPECIES, SUBSPECIES }, + new RankType[] { SUBSPECIES, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { NOTHOSUBSPECIES, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { INFRASPECIFICNAME, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { INFRASUBSPECIESNAME, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { VARIETY, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { NOTHOVARIETY, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { SUBVARIETY, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { FORM, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { NOTHOFORM, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { SUBFORM, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { BIOVAR, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { SEROVAR, NOTHOSPECIES, 
SUPRAGENERICNAME }, + new RankType[] { FORMASPECIALIS, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { CULTIVARGROUP, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { CULTIVAR, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { PATHOVAR, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { HYBRID, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { SUPRAGENERICNAME, NOTHOSPECIES, SUPRAGENERICNAME } + }; + + private static final Map UPPER_BOUND = Arrays.stream(RANK_RANGES).collect(Collectors.toMap( + r -> r[0], + r -> r[1] + )); + + private static final Map LOWER_BOUND = Arrays.stream(RANK_RANGES).collect(Collectors.toMap( + r -> r[0], + r -> r[2] + )); + + /** + * Compare two ranks and see if they are equivalent-ish. + *

+ * Rank comparators allow a degree of slop between ranks, so that a + * subclass and order or supergenus and family are considered close enough. + * Incomparable ranks {@link RankType#INFORMAL} and the like are equal to each other + * and not equal to any other rank. + *

+ * @param rank1 The first rank + * @param rank2 The second rank + * + * @return An equality statement + */ + public Equality compare(RankType rank1, RankType rank2) { + if (rank1 == rank2) + return Equality.EQUAL; + if (rank1.getId() <= 0 && rank2.getId() <= 0) + return Equality.EQUAL; + if (rank1 == UNRANKED || rank2 == UNRANKED || rank1 == INFORMAL || rank2 == INFORMAL) + return Equality.EQUAL; + if (rank1.getId() <= 0 && rank2.getId() <= 0) + return Equality.UNKNOWN; + RankType r1u = UPPER_BOUND.get(rank1); + RankType r1l = LOWER_BOUND.get(rank1); + if ((r1u != null && rank2.compareTo(r1u) >= 0) && (r1l != null && rank2.compareTo(r1l) <= 0)) + return Equality.EQUAL; + RankType r2u = UPPER_BOUND.get(rank2); + RankType r2l = LOWER_BOUND.get(rank2); + if ((r2u != null && rank1.compareTo(r2u) >= 0) && (r2l != null && rank1.compareTo(r2l) <= 0)) + return Equality.EQUAL; + return Equality.DIFFERENT; + } + +} diff --git a/src/main/java/au/org/ala/names/index/ScientificName.java b/src/main/java/au/org/ala/names/index/ScientificName.java index 3a8ca1161..d810ef53f 100644 --- a/src/main/java/au/org/ala/names/index/ScientificName.java +++ b/src/main/java/au/org/ala/names/index/ScientificName.java @@ -147,16 +147,16 @@ private TaxonConcept findBasePrincipal(Taxonomy taxonomy) { return concepts.get(0); if (accepted.size() == 1) return accepted.get(0); - List authored = accepted.stream().filter(tc -> tc.isAuthored() || tc.isAutonym()).collect(Collectors.toList()); - if (authored.size() == 0) - return accepted.get(0); + final int score = accepted.stream().mapToInt(TaxonConcept::getPrincipalScore).max().orElse(TaxonomicElement.MIN_SCORE); + List candidates = accepted.stream().filter(tc -> tc.getPrincipalScore() == score).collect(Collectors.toList()); + if (candidates.size() == 1) + return candidates.get(0); + candidates.sort(REVERSE_PRINCIPAL_SCORE_COMPARATOR); + List authored = candidates.stream().filter(tc -> tc.isAuthored() || tc.isAutonym()).collect(Collectors.toList()); 
if (authored.size() == 1) return authored.get(0); - taxonomy.report(IssueType.COLLISION, "scientificName.collision", this, authored); - final int score = authored.stream().mapToInt(TaxonConcept::getPrincipalScore).max().orElse(TaxonomicElement.MIN_SCORE); - List candidates = authored.stream().filter(tc -> tc.getPrincipalScore() == score).collect(Collectors.toList()); - if (candidates.size() > 1) - taxonomy.report(IssueType.PROBLEM, "scientificName.collision.warn", this, candidates); + taxonomy.report(IssueType.COLLISION, "scientificName.collision", this, candidates); + taxonomy.report(IssueType.PROBLEM, "scientificName.collision.warn", this, candidates); return candidates.get(0); } diff --git a/src/main/java/au/org/ala/names/model/RankType.java b/src/main/java/au/org/ala/names/model/RankType.java index 773c1af75..bd68fb125 100644 --- a/src/main/java/au/org/ala/names/model/RankType.java +++ b/src/main/java/au/org/ala/names/model/RankType.java @@ -31,9 +31,11 @@ public enum RankType { DOMAIN(800, "kingdom", Rank.DOMAIN, null, 800, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Domain", "Domain", "Superkingdom", "Empire"), KINGDOM(1000, "kingdom", Rank.KINGDOM, 2f, 1000, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Kingdom", "Kingdom"), SUBKINGDOM(1200, "subkingdom", Rank.SUBKINGDOM, null, 1200, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subkingdom"), + INFRAKINGDOM(1400, "infrakingdom", Rank.INFRAKINGDOM, null, 1400, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Infrakingdom"), SUPERPHYLUM(1800, "superphylum", Rank.SUPERPHYLUM, null, 2800, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Superphylum"), PHYLUM(2000, "phylum", Rank.PHYLUM, 2f, 2000, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Phylum", "Phylum", "division botany", "Division Botany"), SUBPHYLUM(2200, "subphylum", Rank.SUBPHYLUM, null, 2200, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subphylum", "subdivision botany"), + INFRAPHYLUM(2400, "infraphylum", Rank.INFRAPHYLUM, 
null, 2400, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Infraphylum", "infradivision botany"), SUPERCLASS(2800, "superclass", Rank.SUPERCLASS, null, 2800, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Superclass"), CLASS(3000, "class", Rank.CLASS, 2f, 3000, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Class", "Class"), SUBCLASS(3200, "subclass", Rank.SUBCLASS, null, 3200, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subclass"), @@ -66,7 +68,8 @@ public enum RankType { SUPERGENUS(5900, "genus", null, null, 5900, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Supergenus", "Supergenus"), GENUS_GROUP(5950, "genus group", null, null, 5950, true, "aggregate genera", "Aggregate Genera", "Genus Group"), GENUS(6000, "genus", Rank.GENUS, 3f, 6000, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Genus", "Genus"), - SUBGENUS(6500, "subgenus", Rank.SUBGENUS, null, 6500, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subgenus"), + SUBGENUS(6400, "subgenus", Rank.SUBGENUS, null, 6400, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subgenus"), + INFRAGENUS(6500, "infragenus", Rank.INFRAGENUS, null, 6500, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Infragenus"), SUPERSECTION_BOTANY(6550, "supersection botany", Rank.SECTION, null, 6550, false), SECTION_BOTANY(6600, "section botany", Rank.SECTION, null, 6600, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Section"), SUBSECTION_BOTANY(6700, "subsection botany", Rank.SUBSECTION, null, 6700, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subsection"), diff --git a/src/main/java/org/gbif/nameparser/PhraseNameParser.java b/src/main/java/org/gbif/nameparser/PhraseNameParser.java index 59b181ffc..51e493ac9 100644 --- a/src/main/java/org/gbif/nameparser/PhraseNameParser.java +++ b/src/main/java/org/gbif/nameparser/PhraseNameParser.java @@ -56,6 +56,7 @@ public class PhraseNameParser extends GBIFNameParser { static { HashMap ranks = new HashMap(); + ranks.put("f", Rank.FORM); ranks.put("subsp", 
Rank.SUBSPECIES); ranks.put("ssp", Rank.SUBSPECIES); ranks.put("var", Rank.VARIETY); @@ -68,7 +69,8 @@ public class PhraseNameParser extends GBIFNameParser { public static final String ALL_LETTERS_NUMBERS = NormalisedNameParser.NAME_LETTERS + NormalisedNameParser.name_letters + "0-9"; protected static final String LOCATION_OR_DESCR = "(?:[" + ALL_LETTERS_NUMBERS + " -'\"_\\.]+|\\.)"; protected static final String VOUCHER = "(\\([" + ALL_LETTERS_NUMBERS + "- \\./&,']+\\))"; - protected static final String SOURCE_AUTHORITY = "([" + ALL_LETTERS_NUMBERS + "\\[\\]'\" -,\\.]+|\\.)"; + protected static final String COMMENTARY = "(\\[[^\\]]*\\])"; + protected static final String SOURCE_AUTHORITY = "([" + ALL_LETTERS_NUMBERS + "'\" -,\\.]+|\\.)"; protected static final String PHRASE_RANKS = "(?:" + StringUtils.join(VALID_PHRASE_RANKS.keySet(), "|") + ")\\.? "; private static final String RANK_MARKER_ALL = "(notho)? *(" + StringUtils.join(RankUtils.RANK_MARKER_MAP.keySet(), "|") + ")\\.?"; @@ -85,8 +87,10 @@ public class PhraseNameParser extends GBIFNameParser { // Group 3 indicates the mandatory location/desc for the phrase name. But it may be possible to have homonyms if the VOUCHER is not supplied + "(" + LOCATION_OR_DESCR + ")" //Group 4 is the VOUCHER for the phrase it indicates the collector and a voucher id - + VOUCHER + "?" 
- //Group 5 is the party propsoing addition of the taxon + + VOUCHER + "?(?: *)" + // Group 5 is any commentary + + COMMENTARY + "?(?: *)" + //Group 6 is the party propsoing addition of the taxon + SOURCE_AUTHORITY + "?$" ); @@ -146,7 +150,7 @@ public ParsedName parse(String scientificName, Rank rank) throws UnparsableExcep alapn.setAuthorsParsed(false); alapn.setLocationPhraseDescription(StringUtils.trimToNull(m.group(3))); alapn.setPhraseVoucher(StringUtils.trimToNull(m.group(4))); - alapn.setPhraseNominatingParty(StringUtils.trimToNull(m.group(5))); + alapn.setPhraseNominatingParty(StringUtils.trimToNull(m.group(6))); return alapn; } diff --git a/src/test/java/au/org/ala/names/index/RankComparatorTest.java b/src/test/java/au/org/ala/names/index/RankComparatorTest.java new file mode 100644 index 000000000..40aa3cb38 --- /dev/null +++ b/src/test/java/au/org/ala/names/index/RankComparatorTest.java @@ -0,0 +1,166 @@ +package au.org.ala.names.index; + +import static au.org.ala.names.model.RankType.*; + +import static org.gbif.checklistbank.model.Equality.*; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class RankComparatorTest { + private RankComparator comparator; + + @Before + public void setUp() throws Exception { + this.comparator = new RankComparator(); + } + + @Test + public void testCompare1() { + assertEquals(EQUAL, this.comparator.compare(KINGDOM, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, SUBSPECIES)); + } + + + @Test + public void testCompare2() { + assertEquals(DIFFERENT, 
this.comparator.compare(PHYLUM, KINGDOM)); + assertEquals(EQUAL, this.comparator.compare(PHYLUM, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, SUBSPECIES)); + } + + @Test + public void testCompare3() { + assertEquals(DIFFERENT, this.comparator.compare(CLASS, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, PHYLUM)); + assertEquals(EQUAL, this.comparator.compare(CLASS, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, SUBSPECIES)); + } + + @Test + public void testCompare4() { + assertEquals(DIFFERENT, this.comparator.compare(ORDER, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, CLASS)); + assertEquals(EQUAL, this.comparator.compare(ORDER, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, SUBSPECIES)); + } + + @Test + public void testCompare5() { + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, ORDER)); + 
assertEquals(EQUAL, this.comparator.compare(FAMILY, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, SUBSPECIES)); + } + + @Test + public void testCompare6() { + assertEquals(DIFFERENT, this.comparator.compare(GENUS, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, FAMILY)); + assertEquals(EQUAL, this.comparator.compare(GENUS, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, SUBSPECIES)); + } + + @Test + public void testCompare7() { + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, GENUS)); + assertEquals(EQUAL, this.comparator.compare(SPECIES, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, SUBSPECIES)); + } + + @Test + public void testCompare8() { + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, SPECIES)); + 
assertEquals(EQUAL, this.comparator.compare(SUBSPECIES, SUBSPECIES)); + } + + @Test + public void testCompare9() { + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, SUBPHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, INFRAGENUS)); + assertEquals(DIFFERENT, this.comparator.compare(SUBORDER, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(INFRAORDER, INFRAFAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(SUBGENUS, SUBSECTION_ZOOLOGY)); + assertEquals(DIFFERENT, this.comparator.compare(INFRAFAMILY, SECTION_ZOOLOGY)); + assertEquals(DIFFERENT, this.comparator.compare(INFRAFAMILY, SECTION_BOTANY)); + assertEquals(DIFFERENT, this.comparator.compare(PARVORDER, SPECIES)); + } + + + @Test + public void testCompare10() { + assertEquals(EQUAL, this.comparator.compare(INFRAKINGDOM, SUBPHYLUM)); + assertEquals(EQUAL, this.comparator.compare(SUPERCLASS, INFRACLASS)); + assertEquals(EQUAL, this.comparator.compare(SUBORDER, SUPERFAMILY)); + assertEquals(EQUAL, this.comparator.compare(ORDER, SUPERFAMILY)); + assertEquals(EQUAL, this.comparator.compare(INFRACLASS, ORDER)); + assertEquals(EQUAL, this.comparator.compare(SECTION_BOTANY, SERIES_BOTANY)); + assertEquals(EQUAL, this.comparator.compare(INFRAFAMILY, GENUS)); + assertEquals(EQUAL, this.comparator.compare(PARVORDER, SUBFAMILY)); + } + + @Test + public void testCompare11() { + assertEquals(EQUAL, this.comparator.compare(SUBSPECIES, CULTIVAR)); + assertEquals(EQUAL, this.comparator.compare(VARIETY, FORM)); + assertEquals(EQUAL, this.comparator.compare(SUBVARIETY, SUBSPECIES)); + assertEquals(EQUAL, this.comparator.compare(NOTHOSPECIES, SUBFORM)); + } + + @Test + public void testCompare12() { + assertEquals(EQUAL, this.comparator.compare(SUBSPECIES, INFORMAL)); + assertEquals(DIFFERENT, this.comparator.compare(INCERTAE_SEDIS, FORM)); + assertEquals(EQUAL, this.comparator.compare(UNRANKED, SUBSPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(NOTHOSPECIES, 
SPECIES_INQUIRENDA)); + } + + + @Test + public void testCompare13() { + assertEquals(EQUAL, this.comparator.compare(INCERTAE_SEDIS, INFORMAL)); + assertEquals(EQUAL, this.comparator.compare(UNRANKED, SPECIES_INQUIRENDA)); + assertEquals(EQUAL, this.comparator.compare(UNRANKED, INCERTAE_SEDIS)); + assertEquals(EQUAL, this.comparator.compare(UNRANKED, SPECIES_INQUIRENDA)); + } + +} \ No newline at end of file diff --git a/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java b/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java index 79562db5d..b49c4bae5 100644 --- a/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java +++ b/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java @@ -314,7 +314,6 @@ public void testSpeciesLevelPhraseName5() throws Exception { assertEquals("(BR Maslin 7761)", ((ALAParsedName) pn).getPhraseVoucher()); assertEquals("Goodlands", ((ALAParsedName) pn).cleanPhrase); assertEquals("Maslin7761", ((ALAParsedName) pn).cleanVoucher); - assertEquals("[aff. resinosa]", ((ALAParsedName) pn).getPhraseNominatingParty()); assertEquals(Rank.SPECIES, pn.getRank()); } @@ -331,7 +330,6 @@ public void testSpeciesLevelPhraseName6() throws Exception { assertEquals("(BR Maslin 7711)", ((ALAParsedName) pn).getPhraseVoucher()); assertEquals("Manmanning", ((ALAParsedName) pn).cleanPhrase); assertEquals("Maslin7711", ((ALAParsedName) pn).cleanVoucher); - assertEquals("[aff. multispicata]", ((ALAParsedName) pn).getPhraseNominatingParty()); assertEquals(Rank.SPECIES, pn.getRank()); } @@ -469,15 +467,29 @@ public void testAllNamesForType() { // See https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/1 - // At the moment, not able to correctly parse this out - @Ignore @Test - public void testSpeciesMarkerPhraseName() { + public void testRankMarkerPhraseName1() { try { PhraseNameParser parser = new PhraseNameParser(); ParsedName pn = parser.parse("Marsilea sp. 
Neutral Junction (D.E.Albrecht 9192)"); - pn = parser.parse("Asparagus asparagoides f. Western Cape (R.Taplin 1133)"); - assertEquals("RTaplin1133", ((ALAParsedName) pn).cleanVoucher); + assertEquals(ALAParsedName.class, pn.getClass()); + assertEquals("Albrecht9192", ((ALAParsedName) pn).cleanVoucher); + + } catch (Exception e) { + fail(e.getMessage()); + } + } + + // See https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/1 + // Form doesn't seem to work correctly as it is treating the voucher as an authort + @Test + @Ignore + public void testRankMarkerPhraseName2() { + try { + PhraseNameParser parser = new PhraseNameParser(); + ParsedName pn = parser.parse("Asparagus asparagoides f. Western Cape (R.Taplin 1133)"); + assertEquals(ALAParsedName.class, pn.getClass()); + assertEquals("Albrecht9192", ((ALAParsedName) pn).cleanVoucher); } catch (Exception e) { fail(e.getMessage()); diff --git a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java index a005d1810..c5163709b 100644 --- a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java +++ b/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java @@ -20,7 +20,7 @@ public class ALANameSearcherTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); } @Test @@ -1531,6 +1531,7 @@ public void testMetricsLookup1() throws Exception { } @Ignore // Until sub-taxon synonymy decided + @Test public void testMetricsLookup2() throws Exception { String name = "Trigonaphera vinnulum"; // Synonym of Trigonostoma vinnulum LinnaeanRankClassification cl = new LinnaeanRankClassification(); @@ -1824,7 +1825,7 @@ public void testHigherTaxonomy() throws Exception { cl.setFamily(family); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); 
assertNotNull(metrics); - assertEquals("NZOR-6-49519", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/81da9a0d-ecb6-4040-a56d-12a44042b63b", metrics.getResult().getLsid()); assertEquals(RankType.FAMILY, metrics.getResult().getRank()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); } diff --git a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java index 9d1b670b7..f50c41ea6 100644 --- a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java +++ b/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java @@ -22,17 +22,16 @@ public class BiocacheMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); } @Test - @Ignore public void testMatchHybrid(){ try{ LinnaeanRankClassification cl = new LinnaeanRankClassification(); cl.setScientificName("Eucalyptus globulus x Eucalyptus ovata"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("hybrid", metrics.getNameType().toString()); + assertEquals(NameType.HYBRID, metrics.getNameType()); assertEquals(RankType.SPECIES, metrics.getResult().getRank()); } catch(Exception e){ @@ -125,7 +124,7 @@ public void testCrossRankHomonym() throws Exception { assertFalse("Cross rank homonym should have been resolved",metrics.getErrors().contains(ErrorType.HOMONYM)); } - // @Test + @Test public void testTibicentibicen() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification(); @@ -343,6 +342,7 @@ public void testSubSpeciesMarker1() { fail("Unexpected search exception " + ex); } } + // See https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/1 @Test public void testSubSpeciesMarker2() { diff --git a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java 
b/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java index cd2c324e0..33fee2f64 100644 --- a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java +++ b/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java @@ -30,7 +30,7 @@ public class IconicSpeciesTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); } //@Test diff --git a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java b/src/test/java/au/org/ala/names/search/VernacularMatchTest.java index 62ac8c484..03e07f09b 100644 --- a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java +++ b/src/test/java/au/org/ala/names/search/VernacularMatchTest.java @@ -24,7 +24,7 @@ public class VernacularMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20210629"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); } @Test @@ -38,7 +38,7 @@ public void testVernacular1() throws Exception { assertEquals(expectedLsid, result.getLsid()); } - @Ignore // Requires indidgenous names + //@Ignore // Requires indidgenous names @Test public void testVernacular2() throws Exception { String name = "Dhulwa"; From 080a8bcad7ec6778e1066b38d1b5bccdc26e14c0 Mon Sep 17 00:00:00 2001 From: pal155 Date: Tue, 5 Oct 2021 09:21:48 +1100 Subject: [PATCH 11/19] Restructure library into modules. This will allow applications to just pull in the relevant parts and not create a widening chain of dependenies. 
See https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/127 --- README.md | 64 ++- ala-name-matching-builder/pom.xml | 79 ++++ .../org/ala/names/index/ALANameAnalyser.java | 0 .../org/ala/names/index/ALATaxonResolver.java | 0 .../java/au/org/ala/names/index/BareName.java | 7 +- .../au/org/ala/names/index/CSVNameSource.java | 5 +- .../org/ala/names/index/DwcaNameSource.java | 12 +- .../names/index/IndexBuilderException.java | 0 .../au/org/ala/names/index/IssueType.java | 0 .../java/au/org/ala/names/index/Name.java | 1 - .../au/org/ala/names/index/NameAnalyser.java | 0 .../java/au/org/ala/names/index/NameKey.java | 0 .../au/org/ala/names/index/NameProvider.java | 0 .../au/org/ala/names/index/NameSource.java | 0 .../org/ala/names/index/RankComparator.java | 0 .../java/au/org/ala/names/index/Reporter.java | 0 .../ala/names/index/ResolutionException.java | 1 - .../org/ala/names/index/ScientificName.java | 5 +- .../au/org/ala/names/index/TaxonConcept.java | 0 .../ala/names/index/TaxonConceptInstance.java | 0 .../org/ala/names/index/TaxonResolution.java | 0 .../names/index/TaxonResolutionException.java | 0 .../au/org/ala/names/index/TaxonResolver.java | 0 .../org/ala/names/index/TaxonomicElement.java | 1 - .../java/au/org/ala/names/index/Taxonomy.java | 0 .../org/ala/names/index/TaxonomyBuilder.java | 6 +- .../names/index/TaxonomyConfiguration.java | 1 - .../names/index/UnrankedScientificName.java | 0 .../index/provider/AndTaxonCondition.java | 0 .../provider/ConceptResolutionPriority.java | 0 .../names/index/provider/DiscardStrategy.java | 0 .../ala/names/index/provider/KeyAdjuster.java | 0 .../names/index/provider/KeyAdjustment.java | 0 .../index/provider/MatchTaxonCondition.java | 2 - .../names/index/provider/NameMatchType.java | 0 .../index/provider/OrTaxonCondition.java | 0 .../names/index/provider/ScoreAdjuster.java | 1 - .../names/index/provider/ScoreAdjustment.java | 0 .../names/index/provider/TaxonCondition.java | 0 
.../index/provider/UnrankedStrategy.java | 1 - .../org/ala/names/search/ALANameIndexer.java | 2 +- .../org/ala/names/search/DwcaNameIndexer.java | 4 +- .../au/org/ala/names/util/DwcaWriter.java | 0 .../au/org/ala/names/util/GbifModule.java | 0 .../org/ala/homonyms/cross_rank_homonyms.txt | 0 .../ala/names/index/author_abbreviations.csv | 0 .../au/org/ala/names/index/informal_names.csv | 0 .../ala/names/index/nomenclatural_codes.csv | 0 .../index/nomenclatural_status_codes.csv | 0 .../au/org/ala/names/index/rank_codes.csv | 0 .../ala/names/index/taxonomic_type_codes.csv | 0 .../src}/main/resources/blacklist.txt | 0 .../src}/main/resources/log4j.xml | 0 .../src}/main/resources/taxonomy.properties | 0 .../ala/names/index/ALANameAnalyserTest.java | 0 .../ala/names/index/ALATaxonResolverTest.java | 0 .../ala/names/index/CSVNameSourceTest.java | 0 .../org/ala/names/index/NameProviderTest.java | 3 - .../ala/names/index/RankComparatorTest.java | 0 .../ala/names/index/ScientificNameTest.java | 0 .../org/ala/names/index/TaxonConceptTest.java | 0 .../index/TaxonomyConfiugrationTest.java | 1 - .../au/org/ala/names/index/TaxonomyTest.java | 0 .../index/provider/AndTaxonConditionTest.java | 1 - .../names/index/provider/KeyAdjusterTest.java | 0 .../provider/MatchTaxonConditionTest.java | 0 .../index/provider/OrTaxonConditionTest.java | 0 .../index/provider/ScoreAdjusterTest.java | 0 .../java/au/org/ala/names/util/TestUtils.java | 0 .../au/org/ala/names/index/dwca-1/meta.xml | 0 .../au/org/ala/names/index/dwca-1/taxon.csv | 0 .../names/index/dwca-1/vernacularNames.csv | 0 .../org/ala/names/index/name-provider-1.json | 0 .../names/index/provider/and-condition-1.json | 0 .../index/provider/match-condition-1.json | 0 .../names/index/provider/or-condition-1.json | 0 .../au/org/ala/names/index/taxonomy-1.csv | 0 .../au/org/ala/names/index/taxonomy-10.csv | 0 .../au/org/ala/names/index/taxonomy-11.csv | 0 .../au/org/ala/names/index/taxonomy-12.csv | 0 
.../au/org/ala/names/index/taxonomy-13.csv | 0 .../au/org/ala/names/index/taxonomy-14.csv | 0 .../au/org/ala/names/index/taxonomy-15.csv | 0 .../au/org/ala/names/index/taxonomy-16.csv | 0 .../au/org/ala/names/index/taxonomy-17.csv | 0 .../au/org/ala/names/index/taxonomy-18.csv | 0 .../au/org/ala/names/index/taxonomy-19.csv | 0 .../au/org/ala/names/index/taxonomy-2.csv | 0 .../au/org/ala/names/index/taxonomy-20.csv | 0 .../au/org/ala/names/index/taxonomy-21.csv | 0 .../au/org/ala/names/index/taxonomy-22.csv | 0 .../au/org/ala/names/index/taxonomy-23.csv | 0 .../au/org/ala/names/index/taxonomy-24.csv | 0 .../au/org/ala/names/index/taxonomy-25.csv | 0 .../au/org/ala/names/index/taxonomy-26.csv | 0 .../au/org/ala/names/index/taxonomy-27.csv | 0 .../au/org/ala/names/index/taxonomy-28.csv | 0 .../au/org/ala/names/index/taxonomy-29.csv | 0 .../au/org/ala/names/index/taxonomy-3.csv | 0 .../au/org/ala/names/index/taxonomy-30.csv | 0 .../au/org/ala/names/index/taxonomy-31.csv | 0 .../au/org/ala/names/index/taxonomy-4.csv | 0 .../au/org/ala/names/index/taxonomy-5.csv | 0 .../au/org/ala/names/index/taxonomy-6.csv | 0 .../au/org/ala/names/index/taxonomy-7.csv | 0 .../au/org/ala/names/index/taxonomy-8.csv | 0 .../au/org/ala/names/index/taxonomy-9.csv | 0 .../au/org/ala/names/index/taxonomy-bad-1.csv | 0 .../ala/names/index/taxonomy-config-1.json | 0 .../ala/names/index/taxonomy-config-2.json | 0 .../ala/names/index/taxonomy-config-3.json | 0 .../ala/names/index/taxonomy-config-4.json | 0 .../au/org/ala/names/index/vernacular-1.csv | 0 ala-name-matching-distribution/pom.xml | 67 +++ .../src/assembly/assembly.xml | 35 ++ .../src/main/scripts/compare.sh | 4 + .../src/main/scripts/dump.sh | 4 + .../src/main/scripts/generate.sh | 4 + .../src/main/scripts/index.sh | 4 + .../src/main/scripts/merge.sh | 4 + ala-name-matching-model/pom.xml | 44 ++ .../au/org/ala/names/model/ALAParsedName.java | 0 .../au/org/ala/names/model/ErrorType.java | 0 .../model/LinnaeanRankClassification.java | 44 
+- .../au/org/ala/names/model/MatchMetrics.java | 3 +- .../au/org/ala/names/model/MatchType.java | 0 .../org/ala/names/model/MetricsResultDTO.java | 0 .../java/au/org/ala/names/model/NameFlag.java | 0 .../org/ala/names/model/NameSearchResult.java | 68 +-- .../java/au/org/ala/names/model/RankType.java | 0 .../au/org/ala/names/model/SynonymType.java | 0 .../au/org/ala/names/model/TaxonomicType.java | 0 .../ala/names/model/TaxonomicTypeGroup.java | 0 .../org/ala/names/model/VernacularType.java | 0 .../names/search/ExcludedNameException.java | 0 .../ala/names/search/HomonymException.java | 4 +- .../ala/names/search/MisappliedException.java | 0 .../search/ParentSynonymChildException.java | 0 .../au/org/ala/names/search/SPPException.java | 0 .../names/search/SearchResultException.java | 3 +- .../ala/names/util/CleanedScientificName.java | 0 .../org/ala/names/util/TaxonNameSoundEx.java | 2 +- .../main/java/au/org/ala/vocab/ALATerm.java | 0 .../org/gbif/nameparser/PhraseNameParser.java | 5 +- .../parser/util/PhraseNameParserTests.java | 0 .../names/util/CleanedScientificNameTest.java | 0 .../gbif/nameparser/PhraseNameParserTest.java | 2 - ala-name-matching-search/pom.xml | 55 +++ .../analyzer/LowerCaseKeywordAnalyzer.java | 9 +- .../org/ala/names/search/ALANameSearcher.java | 100 ++++- .../au/org/ala/names/search}/FieldType.java | 5 +- .../org/ala/names/search}/NameIndexField.java | 12 +- .../java/au/org/ala/names/util/FileUtils.java | 2 +- .../org/ala/names/util/TaxonNameSoundEx.java | 281 ++++++++++++ .../org/ala/homonyms/cross_rank_homonyms.txt | 33 ++ .../org/ala/propertystore/known_homonyms.txt | 0 .../ala/names/search/ALANameSearcherTest.java | 0 .../ala/names/search/BiocacheMatchTest.java | 0 .../ala/names/search/IconicSpeciesTest.java | 9 +- .../ala/names/search/VernacularMatchTest.java | 0 .../ala/names/search/iconic_species_list.csv | 0 ala-name-matching-tools/pom.xml | 38 ++ .../org/ala/names/util/NameListComparer.java | 8 +- 
.../org/ala/names/util/NameListGenerator.java | 7 +- .../java/au/org/ala/names/util/TermDump.java | 0 .../src/main/resources/log4j.xml | 32 ++ .../CB_script_AFD_synonyms.sql | 0 data/historical/README.md | 1 + data/{ => historical}/all-families.txt | 0 .../historical}/applicationContext-cb.xml | 0 .../search => data/historical}/ba_names.txt | 0 .../historical}/bio_aust_birds.txt | 0 .../historical}/biocache_animal_col.txt | 0 .../search => data/historical}/birds.txt | 0 .../search => data/historical}/caab_fish.txt | 0 .../historical}/db/CoL2010_dwc_export.sql | 22 +- .../historical}/db/CoL_commonNames.sql | 18 +- .../db/checklist_bank_model_additions_ala.sql | 240 +++++----- .../org/ala => data/historical}/db/irmng.sql | 34 +- .../nomenclatural_status.properties | 0 .../spatial-distribution-names.txt | 0 pom.xml | 166 ++----- src/assembly/assembly.xml | 30 -- src/main/java/au/org/ala/vocab/Concept.java | 143 ------ src/main/java/au/org/ala/vocab/TaxonRank.java | 63 --- .../java/au/org/ala/vocab/Vocabulary.java | 98 ---- .../ala/names/search/MatchMetricsTest.java | 422 ------------------ .../java/au/org/ala/vocab/TaxonRankTest.java | 33 -- .../au/org/ala/vocab/taxon-rank-1.json | 14 - 189 files changed, 1084 insertions(+), 1285 deletions(-) create mode 100644 ala-name-matching-builder/pom.xml rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/ALANameAnalyser.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/ALATaxonResolver.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/BareName.java (97%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/CSVNameSource.java (99%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/DwcaNameSource.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/IndexBuilderException.java (100%) rename {src => 
ala-name-matching-builder/src}/main/java/au/org/ala/names/index/IssueType.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/Name.java (99%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/NameAnalyser.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/NameKey.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/NameProvider.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/NameSource.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/RankComparator.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/Reporter.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/ResolutionException.java (96%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/ScientificName.java (99%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/TaxonConcept.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/TaxonConceptInstance.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/TaxonResolution.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/TaxonResolutionException.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/TaxonResolver.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/TaxonomicElement.java (99%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/Taxonomy.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/TaxonomyBuilder.java (98%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/TaxonomyConfiguration.java (99%) rename {src => 
ala-name-matching-builder/src}/main/java/au/org/ala/names/index/UnrankedScientificName.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/DiscardStrategy.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/KeyAdjuster.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/KeyAdjustment.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java (99%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/NameMatchType.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java (97%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/TaxonCondition.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java (99%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/search/ALANameIndexer.java (99%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/search/DwcaNameIndexer.java (99%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/util/DwcaWriter.java (100%) rename {src => ala-name-matching-builder/src}/main/java/au/org/ala/names/util/GbifModule.java (100%) rename {src => 
ala-name-matching-builder/src}/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt (100%) rename {src => ala-name-matching-builder/src}/main/resources/au/org/ala/names/index/author_abbreviations.csv (100%) rename {src => ala-name-matching-builder/src}/main/resources/au/org/ala/names/index/informal_names.csv (100%) rename {src => ala-name-matching-builder/src}/main/resources/au/org/ala/names/index/nomenclatural_codes.csv (100%) rename {src => ala-name-matching-builder/src}/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv (100%) rename {src => ala-name-matching-builder/src}/main/resources/au/org/ala/names/index/rank_codes.csv (100%) rename {src => ala-name-matching-builder/src}/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv (100%) rename {src => ala-name-matching-builder/src}/main/resources/blacklist.txt (100%) rename {src => ala-name-matching-builder/src}/main/resources/log4j.xml (100%) rename {src => ala-name-matching-builder/src}/main/resources/taxonomy.properties (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/ALANameAnalyserTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/ALATaxonResolverTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/CSVNameSourceTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/NameProviderTest.java (99%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/RankComparatorTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/ScientificNameTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/TaxonConceptTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java (99%) rename {src => 
ala-name-matching-builder/src}/test/java/au/org/ala/names/index/TaxonomyTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java (99%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java (100%) rename {src => ala-name-matching-builder/src}/test/java/au/org/ala/names/util/TestUtils.java (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/dwca-1/meta.xml (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/dwca-1/taxon.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/name-provider-1.json (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/provider/and-condition-1.json (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/provider/match-condition-1.json (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/provider/or-condition-1.json (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-1.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-10.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-11.csv (100%) rename {src => 
ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-12.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-13.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-14.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-15.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-16.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-17.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-18.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-19.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-2.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-20.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-21.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-22.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-23.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-24.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-25.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-26.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-27.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-28.csv (100%) rename {src => 
ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-29.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-3.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-30.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-31.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-4.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-5.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-6.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-7.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-8.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-9.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-config-1.json (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-config-2.json (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-config-3.json (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/taxonomy-config-4.json (100%) rename {src => ala-name-matching-builder/src}/test/resources/au/org/ala/names/index/vernacular-1.csv (100%) create mode 100644 ala-name-matching-distribution/pom.xml create mode 100644 ala-name-matching-distribution/src/assembly/assembly.xml create mode 100644 ala-name-matching-distribution/src/main/scripts/compare.sh create mode 100644 
ala-name-matching-distribution/src/main/scripts/dump.sh create mode 100644 ala-name-matching-distribution/src/main/scripts/generate.sh create mode 100644 ala-name-matching-distribution/src/main/scripts/index.sh create mode 100644 ala-name-matching-distribution/src/main/scripts/merge.sh create mode 100644 ala-name-matching-model/pom.xml rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/ALAParsedName.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/ErrorType.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/LinnaeanRankClassification.java (87%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/MatchMetrics.java (98%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/MatchType.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/MetricsResultDTO.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/NameFlag.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/NameSearchResult.java (71%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/RankType.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/SynonymType.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/TaxonomicType.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/model/VernacularType.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/search/ExcludedNameException.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/search/HomonymException.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/search/MisappliedException.java 
(100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/search/ParentSynonymChildException.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/search/SPPException.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/search/SearchResultException.java (99%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/util/CleanedScientificName.java (100%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/names/util/TaxonNameSoundEx.java (99%) rename {src => ala-name-matching-model/src}/main/java/au/org/ala/vocab/ALATerm.java (100%) rename {src => ala-name-matching-model/src}/main/java/org/gbif/nameparser/PhraseNameParser.java (98%) rename {src => ala-name-matching-model/src}/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java (100%) rename {src => ala-name-matching-model/src}/test/java/au/org/ala/names/util/CleanedScientificNameTest.java (100%) rename {src => ala-name-matching-model/src}/test/java/org/gbif/nameparser/PhraseNameParserTest.java (96%) create mode 100644 ala-name-matching-search/pom.xml rename {src => ala-name-matching-search/src}/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java (88%) rename {src => ala-name-matching-search/src}/main/java/au/org/ala/names/search/ALANameSearcher.java (95%) rename {src/main/java/au/org/ala/names/model => ala-name-matching-search/src/main/java/au/org/ala/names/search}/FieldType.java (98%) rename {src/main/java/au/org/ala/names/model => ala-name-matching-search/src/main/java/au/org/ala/names/search}/NameIndexField.java (93%) rename {src => ala-name-matching-search/src}/main/java/au/org/ala/names/util/FileUtils.java (98%) create mode 100644 ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java create mode 100644 ala-name-matching-search/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt rename {src => 
ala-name-matching-search/src}/main/resources/au/org/ala/propertystore/known_homonyms.txt (100%) rename {src => ala-name-matching-search/src}/test/java/au/org/ala/names/search/ALANameSearcherTest.java (100%) rename {src => ala-name-matching-search/src}/test/java/au/org/ala/names/search/BiocacheMatchTest.java (100%) rename {src => ala-name-matching-search/src}/test/java/au/org/ala/names/search/IconicSpeciesTest.java (98%) rename {src => ala-name-matching-search/src}/test/java/au/org/ala/names/search/VernacularMatchTest.java (100%) rename {src => ala-name-matching-search/src}/test/resources/au/org/ala/names/search/iconic_species_list.csv (100%) create mode 100644 ala-name-matching-tools/pom.xml rename {src => ala-name-matching-tools/src}/main/java/au/org/ala/names/util/NameListComparer.java (98%) rename {src => ala-name-matching-tools/src}/main/java/au/org/ala/names/util/NameListGenerator.java (95%) rename {src => ala-name-matching-tools/src}/main/java/au/org/ala/names/util/TermDump.java (100%) create mode 100644 ala-name-matching-tools/src/main/resources/log4j.xml rename data/{ => historical}/CB_script_AFD_synonyms.sql (100%) create mode 100644 data/historical/README.md rename data/{ => historical}/all-families.txt (100%) rename {src/main/resources/au/org/ala/propertystore => data/historical}/applicationContext-cb.xml (100%) rename {src/test/resources/au/org/ala/names/search => data/historical}/ba_names.txt (100%) rename {src/test/resources/au/org/ala/names/search => data/historical}/bio_aust_birds.txt (100%) rename {src/test/resources/au/org/ala/names/search => data/historical}/biocache_animal_col.txt (100%) rename {src/test/resources/au/org/ala/names/search => data/historical}/birds.txt (100%) rename {src/test/resources/au/org/ala/names/search => data/historical}/caab_fish.txt (100%) rename {src/main/resources/au/org/ala => data/historical}/db/CoL2010_dwc_export.sql (98%) rename {src/main/resources/au/org/ala => data/historical}/db/CoL_commonNames.sql (98%) rename 
{src/main/resources/au/org/ala => data/historical}/db/checklist_bank_model_additions_ala.sql (98%) rename {src/main/resources/au/org/ala => data/historical}/db/irmng.sql (98%) rename {src/main/resources/au/org/ala/vocab => data/historical}/nomenclatural_status.properties (100%) rename {src/test/resources/au/org/ala/names/search => data/historical}/spatial-distribution-names.txt (100%) delete mode 100644 src/assembly/assembly.xml delete mode 100644 src/main/java/au/org/ala/vocab/Concept.java delete mode 100644 src/main/java/au/org/ala/vocab/TaxonRank.java delete mode 100644 src/main/java/au/org/ala/vocab/Vocabulary.java delete mode 100644 src/test/java/au/org/ala/names/search/MatchMetricsTest.java delete mode 100644 src/test/java/au/org/ala/vocab/TaxonRankTest.java delete mode 100644 src/test/resources/au/org/ala/vocab/taxon-rank-1.json diff --git a/README.md b/README.md index 7abe2c0e5..8b293148f 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,25 @@ This API borrows heavily from the name parsing great work done by [GBIF](https:/ in their [scientific name parser library](https://github.com/gbif/name-parser) This code contains additions for handling some Australian specific issues. +## Modules + +* **ala-name-matching-model** The data model used by the name matching index. + This module contains a number of useful vocabularies that you may want to + include in your application, even if you don't want to name match. +* **ala-name-matching-search** Local name index searching. + Include this in your application if you want to match names against a local name index. +* **ala-name-matching-builder** Merge taxonomies and build name indexes. + This is a separate module to the searcher so that you can build the name + index that the searcher uses, without importing a shedload of dependencies + if you just want to search for things. +* **ala-name-matching-tools** Some useful utilities that can be used to + do bulk matching for testing and the like.
+* **ala-name-matching-distribution** A full distribution zip file, including + some shell scripts to get various commands going. + ## Versions -Currently there are 2 versions of this library, 2.x and 3.x. -* 2.x is using lucene 4. -* 3.x is using lucene 6 or above. +Version 4.x of the library uses Lucene 8. ## Generating a name match index @@ -41,17 +55,19 @@ You can download the IRMNG DwCA for homonyms from the following URL: An assembly zip file for this can be downloaded from our maven repository : -[ala-name-matching-3.5-distribution.zip](http://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching/3.5/ala-name-matching-3.5-distribution.zip) +[ala-name-matching-4.0-SNAPSHOT-distribution.zip](http://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching/3.5/ala-name-matching-3.5-distribution.zip) To generate the name index using the data described above, follow these steps. Alternatively use the [ALA Ansible scripts](https://github.com/AtlasOfLivingAustralia/ala-install) here using the playbook [nameindexer.yml](https://github.com/AtlasOfLivingAustralia/ala-install/blob/master/ansible/nameindexer-standalone.yml) which does it all for you. * Download the zip files linked above to a directory e.g. /data/names/ and extract them -* Download the distribution zip [ala-name-matching-3.5-distribution.zip](http://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching/3.5/ala-name-matching-3.5-distribution.zip) +* Download the distribution zip [ala-name-matching-distribution-4.0-SNAPSHOT-distribution.zip](http://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching/3.5/ala-name-matching-distribution-4.0-SNAPSHOT-distribution.zip) + and unzip it. + You will find a number of shell scripts in the base directory.
* Generate the names index with command: ``` -java -jar ala-name-matching-3.5.jar --all --dwca /data/names/dwca-col --target /data/lucene/testdwc-namematching --irmng /data/names/irmng/IRMNG_DWC_HOMONYMS --common /data/names/col_vernacular.txt +./index.sh --all --dwca /data/names/dwca-col --target /data/lucene/testdwc-namematching --irmng /data/names/irmng/IRMNG_DWC_HOMONYMS --common /data/names/col_vernacular.txt ``` Please be aware that the names indexing could take over an hour to complete. @@ -66,7 +82,7 @@ into a single, combined taxonomy. An example command for the taxonomy builder is: ``` -java --classpath au.org.ala.names.index.TaxonomyBuilder -c /data/names/ala-taxon-config.json -w tmp -o /data/names/combined /data/names/APNI/DwC /data/names/AFD/DwC /data/names/CAAB/DwC +./merge.sh -c /data/names/ala-taxon-config.json -w tmp -o /data/names/combined /data/names/APNI/DwC /data/names/AFD/DwC /data/names/CAAB/DwC ``` More information about the merge configuration can be found [here](doc/merge-config.md). @@ -76,11 +92,15 @@ More information about the merge configuration can be found [here](doc/merge-con This library is built with maven. By default a `mvn install` will try to run a test suite which will fail without a local installation of a name index. To skip this step, run a build with ```mvn install -DskipTests=true```. 
-The build creates 3 artefacts in the ala-name-matching/target directory: +The build creates one artefact in the `ala-name-matching-distribution/target` directory: + +* ala-name-matching-distribution-4.0-SNAPSHOT-distribution.zip - zip containing the project jar and dependencies -* ala-name-matching-3.5.jar - built jar for the project code only -* ala-name-matching-3.5-distribution.zip - zip containing the project jar and dependencies -* ala-name-matching-3.5-sources.jar - source jar for the project code only +Each module contains two artefacts in the +`ala-name-matching/ala-name-matching-/target` directory: + +* ala-name-matching--4.0-SNAPSHOT.jar - built jar for the project code only +* ala-name-matching--4.0-SNAPSHOT-sources.jar - source jar for the project code only The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20220629) and needs to be extracted to the directory `/data/lucene/namematching-20210811` @@ -116,19 +136,29 @@ The ALA Name Matching is available as a library that can be used in other projec To use ala-name-matching, include it as a dependency in your pom file: ``` - - au.org.ala - ala-name-matching - 3.5 - + + au.org.ala + ala-name-matching-search + 4.0-SNAPSHOT + ``` +If you just want the handy enums and such-like, use +``` + + au.org.ala + ala-name-matching-model + 4.0-SNAPSHOT + +``` + + If you are using grails 3, you may encounter problems with the newer GBIF libraries having validation code that conflicts with spring validation. 
You can correct this by using ``` -compile("au.org.ala:ala-name-matching:3.5") { +compile("au.org.ala:ala-name-matching-search:4.0-SNAPSHOT") { exclude group: 'org.slf4j', module: 'slf4j-log4j12' exclude group: 'org.apache.bval', module: 'org.apache.bval.bundle' } diff --git a/ala-name-matching-builder/pom.xml b/ala-name-matching-builder/pom.xml new file mode 100644 index 000000000..8ae325219 --- /dev/null +++ b/ala-name-matching-builder/pom.xml @@ -0,0 +1,79 @@ + + + 4.0.0 + + + au.org.ala + ala-name-matching + 4.0-SNAPSHOT + + + ala-name-matching-builder + jar + ALA Name Matching Taxonomy Merging and Index Building + Tools to first merge multiple taxonomies together and then build a searchable index out of the resulting taxonomy + + + au.org.ala + ala-name-matching-model + ${project.version} + + + au.org.ala + ala-name-matching-search + ${project.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + + org.gbif + dwca-io + ${dwca-io.version} + + + commons-io + commons-io + + + org.slf4j + slf4j-api + + + + + org.gbif.checklistbank + checklistbank-common + ${checklist-bank.version} + + + org.gbif.registry + registry-ws-client + + + com.beust + jcommander + + + org.slf4j + jcl-over-slf4j + + + io.dropwizard.metrics + metrics-core + + + io.dropwizard.metrics + metrics-ganglia + + + + + commons-cli + commons-cli + ${commons-cli.version} + + + diff --git a/src/main/java/au/org/ala/names/index/ALANameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java similarity index 100% rename from src/main/java/au/org/ala/names/index/ALANameAnalyser.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java diff --git a/src/main/java/au/org/ala/names/index/ALATaxonResolver.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java similarity index 100% rename from src/main/java/au/org/ala/names/index/ALATaxonResolver.java rename to 
ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java diff --git a/src/main/java/au/org/ala/names/index/BareName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java similarity index 97% rename from src/main/java/au/org/ala/names/index/BareName.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java index 5902e1974..bfe13354f 100644 --- a/src/main/java/au/org/ala/names/index/BareName.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java @@ -1,9 +1,8 @@ package au.org.ala.names.index; -import au.org.ala.names.model.RankType; -import org.gbif.api.vocabulary.NomenclaturalCode; - -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.stream.Collectors; /** diff --git a/src/main/java/au/org/ala/names/index/CSVNameSource.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java similarity index 99% rename from src/main/java/au/org/ala/names/index/CSVNameSource.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java index 672d2027c..abe76be62 100644 --- a/src/main/java/au/org/ala/names/index/CSVNameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java @@ -1,8 +1,8 @@ package au.org.ala.names.index; -import au.org.ala.vocab.ALATerm; import au.org.ala.names.model.RankType; import au.org.ala.names.model.TaxonomicType; +import au.org.ala.vocab.ALATerm; import com.opencsv.CSVReader; import com.opencsv.CSVReaderBuilder; import org.apache.lucene.document.Document; @@ -17,7 +17,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.IOException; +import java.io.Reader; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; diff --git a/src/main/java/au/org/ala/names/index/DwcaNameSource.java 
b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java similarity index 100% rename from src/main/java/au/org/ala/names/index/DwcaNameSource.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java index 00e004961..8a5f0f886 100644 --- a/src/main/java/au/org/ala/names/index/DwcaNameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java @@ -1,9 +1,9 @@ package au.org.ala.names.index; -import au.org.ala.names.model.VernacularType; -import au.org.ala.vocab.ALATerm; import au.org.ala.names.model.RankType; import au.org.ala.names.model.TaxonomicType; +import au.org.ala.names.model.VernacularType; +import au.org.ala.vocab.ALATerm; import org.apache.commons.beanutils.BeanUtils; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -13,15 +13,15 @@ import org.gbif.api.model.registry.Dataset; import org.gbif.api.vocabulary.*; import org.gbif.dwc.terms.DcTerm; -import org.gbif.dwc.terms.GbifTerm; -import org.gbif.dwca.io.MetadataException; -import org.gbif.dwca.record.Record; -import org.gbif.dwca.record.StarRecord; import org.gbif.dwc.terms.DwcTerm; +import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; import org.gbif.dwca.io.Archive; import org.gbif.dwca.io.ArchiveFactory; import org.gbif.dwca.io.ArchiveFile; +import org.gbif.dwca.io.MetadataException; +import org.gbif.dwca.record.Record; +import org.gbif.dwca.record.StarRecord; import java.io.File; import java.io.IOException; diff --git a/src/main/java/au/org/ala/names/index/IndexBuilderException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java similarity index 100% rename from src/main/java/au/org/ala/names/index/IndexBuilderException.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java diff --git a/src/main/java/au/org/ala/names/index/IssueType.java 
b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java similarity index 100% rename from src/main/java/au/org/ala/names/index/IssueType.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java diff --git a/src/main/java/au/org/ala/names/index/Name.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java similarity index 99% rename from src/main/java/au/org/ala/names/index/Name.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java index 96d61b7a0..c8807c42d 100644 --- a/src/main/java/au/org/ala/names/index/Name.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java @@ -2,7 +2,6 @@ import au.org.ala.names.model.RankType; -import org.gbif.api.vocabulary.NomenclaturalCode; import java.util.*; import java.util.stream.Collectors; diff --git a/src/main/java/au/org/ala/names/index/NameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java similarity index 100% rename from src/main/java/au/org/ala/names/index/NameAnalyser.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java diff --git a/src/main/java/au/org/ala/names/index/NameKey.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java similarity index 100% rename from src/main/java/au/org/ala/names/index/NameKey.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java diff --git a/src/main/java/au/org/ala/names/index/NameProvider.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java similarity index 100% rename from src/main/java/au/org/ala/names/index/NameProvider.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java diff --git a/src/main/java/au/org/ala/names/index/NameSource.java 
b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java similarity index 100% rename from src/main/java/au/org/ala/names/index/NameSource.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java diff --git a/src/main/java/au/org/ala/names/index/RankComparator.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java similarity index 100% rename from src/main/java/au/org/ala/names/index/RankComparator.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java diff --git a/src/main/java/au/org/ala/names/index/Reporter.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java similarity index 100% rename from src/main/java/au/org/ala/names/index/Reporter.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java diff --git a/src/main/java/au/org/ala/names/index/ResolutionException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java similarity index 96% rename from src/main/java/au/org/ala/names/index/ResolutionException.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java index d73e47940..fa9a8c722 100644 --- a/src/main/java/au/org/ala/names/index/ResolutionException.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java @@ -1,7 +1,6 @@ package au.org.ala.names.index; import javax.annotation.Nullable; -import java.util.Collections; import java.util.List; /** diff --git a/src/main/java/au/org/ala/names/index/ScientificName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java similarity index 99% rename from src/main/java/au/org/ala/names/index/ScientificName.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java index d810ef53f..3e2bee463 100644 --- 
a/src/main/java/au/org/ala/names/index/ScientificName.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java @@ -6,7 +6,10 @@ import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; import java.util.stream.Collectors; /** diff --git a/src/main/java/au/org/ala/names/index/TaxonConcept.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java similarity index 100% rename from src/main/java/au/org/ala/names/index/TaxonConcept.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java diff --git a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java similarity index 100% rename from src/main/java/au/org/ala/names/index/TaxonConceptInstance.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java diff --git a/src/main/java/au/org/ala/names/index/TaxonResolution.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java similarity index 100% rename from src/main/java/au/org/ala/names/index/TaxonResolution.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java diff --git a/src/main/java/au/org/ala/names/index/TaxonResolutionException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java similarity index 100% rename from src/main/java/au/org/ala/names/index/TaxonResolutionException.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java diff --git a/src/main/java/au/org/ala/names/index/TaxonResolver.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java similarity index 100% rename from 
src/main/java/au/org/ala/names/index/TaxonResolver.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java diff --git a/src/main/java/au/org/ala/names/index/TaxonomicElement.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java similarity index 99% rename from src/main/java/au/org/ala/names/index/TaxonomicElement.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java index 339193252..256fc4f7f 100644 --- a/src/main/java/au/org/ala/names/index/TaxonomicElement.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java @@ -1,7 +1,6 @@ package au.org.ala.names.index; import au.org.ala.names.model.RankType; -import org.gbif.api.vocabulary.NomenclaturalCode; import java.util.Comparator; diff --git a/src/main/java/au/org/ala/names/index/Taxonomy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java similarity index 100% rename from src/main/java/au/org/ala/names/index/Taxonomy.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java diff --git a/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java similarity index 98% rename from src/main/java/au/org/ala/names/index/TaxonomyBuilder.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java index d9784b94d..56648d19a 100644 --- a/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java @@ -1,16 +1,14 @@ package au.org.ala.names.index; -import au.org.ala.names.search.ALANameSearcher; import au.org.ala.names.search.DwcaNameIndexer; -import au.org.ala.names.util.FileUtils; import org.apache.commons.cli.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; 
+import java.io.File; +import java.io.FileInputStream; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.List; import java.util.stream.Collectors; diff --git a/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java similarity index 99% rename from src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java index 9a1451a10..c51476068 100644 --- a/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java @@ -18,7 +18,6 @@ import java.io.*; import java.net.URI; import java.util.*; -import java.util.stream.Collectors; /** * A readable description of a taxonomy construction. diff --git a/src/main/java/au/org/ala/names/index/UnrankedScientificName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java similarity index 100% rename from src/main/java/au/org/ala/names/index/UnrankedScientificName.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java diff --git a/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java diff --git a/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java 
rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java diff --git a/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java diff --git a/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java diff --git a/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java diff --git a/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java similarity index 99% rename from src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java index 52047e6dd..706049778 100644 --- a/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java @@ -1,12 +1,10 @@ package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; -import au.org.ala.names.index.NameProvider; import 
au.org.ala.names.index.TaxonConceptInstance; import au.org.ala.names.model.RankType; import au.org.ala.names.model.TaxonomicType; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonProperty; import org.gbif.api.vocabulary.NameType; import org.gbif.api.vocabulary.NomenclaturalCode; import org.gbif.api.vocabulary.NomenclaturalStatus; diff --git a/src/main/java/au/org/ala/names/index/provider/NameMatchType.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/NameMatchType.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java diff --git a/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java diff --git a/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java similarity index 97% rename from src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java index e1cd00e8a..cbd30d1a5 100644 --- a/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java @@ -6,7 +6,6 @@ import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; /** * A score adjustment for applying to a specific diff --git a/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java 
b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java diff --git a/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java similarity index 100% rename from src/main/java/au/org/ala/names/index/provider/TaxonCondition.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java diff --git a/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java similarity index 99% rename from src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java index 1f997c0cb..1b1691cae 100644 --- a/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java @@ -5,7 +5,6 @@ import au.org.ala.names.model.TaxonomicType; import java.util.Arrays; -import java.util.Collection; import java.util.HashSet; import java.util.Set; diff --git a/src/main/java/au/org/ala/names/search/ALANameIndexer.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/ALANameIndexer.java similarity index 99% rename from src/main/java/au/org/ala/names/search/ALANameIndexer.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/search/ALANameIndexer.java index f96890c65..1b22633e6 100644 --- a/src/main/java/au/org/ala/names/search/ALANameIndexer.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/ALANameIndexer.java @@ -24,7 +24,7 @@ import 
com.opencsv.CSVReaderBuilder; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; diff --git a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java similarity index 99% rename from src/main/java/au/org/ala/names/search/DwcaNameIndexer.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java index a74bdb705..c3f02b8c5 100644 --- a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java @@ -21,8 +21,8 @@ import com.opencsv.CSVReader; import org.apache.commons.cli.*; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.time.DateFormatUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.time.DateFormatUtils; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; diff --git a/src/main/java/au/org/ala/names/util/DwcaWriter.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java similarity index 100% rename from src/main/java/au/org/ala/names/util/DwcaWriter.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java diff --git a/src/main/java/au/org/ala/names/util/GbifModule.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java similarity index 100% rename from src/main/java/au/org/ala/names/util/GbifModule.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java diff --git 
a/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt b/ala-name-matching-builder/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt similarity index 100% rename from src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt rename to ala-name-matching-builder/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt diff --git a/src/main/resources/au/org/ala/names/index/author_abbreviations.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/author_abbreviations.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/author_abbreviations.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/author_abbreviations.csv diff --git a/src/main/resources/au/org/ala/names/index/informal_names.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/informal_names.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv diff --git a/src/main/resources/au/org/ala/names/index/nomenclatural_codes.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/nomenclatural_codes.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/nomenclatural_codes.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/nomenclatural_codes.csv diff --git a/src/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv diff --git a/src/main/resources/au/org/ala/names/index/rank_codes.csv 
b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/rank_codes.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/rank_codes.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/rank_codes.csv diff --git a/src/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv diff --git a/src/main/resources/blacklist.txt b/ala-name-matching-builder/src/main/resources/blacklist.txt similarity index 100% rename from src/main/resources/blacklist.txt rename to ala-name-matching-builder/src/main/resources/blacklist.txt diff --git a/src/main/resources/log4j.xml b/ala-name-matching-builder/src/main/resources/log4j.xml similarity index 100% rename from src/main/resources/log4j.xml rename to ala-name-matching-builder/src/main/resources/log4j.xml diff --git a/src/main/resources/taxonomy.properties b/ala-name-matching-builder/src/main/resources/taxonomy.properties similarity index 100% rename from src/main/resources/taxonomy.properties rename to ala-name-matching-builder/src/main/resources/taxonomy.properties diff --git a/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java diff --git a/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java similarity index 100% rename from 
src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java diff --git a/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/CSVNameSourceTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java diff --git a/src/test/java/au/org/ala/names/index/NameProviderTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java similarity index 99% rename from src/test/java/au/org/ala/names/index/NameProviderTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java index 89fc351bd..686394d4d 100644 --- a/src/test/java/au/org/ala/names/index/NameProviderTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java @@ -2,14 +2,11 @@ import au.org.ala.names.model.TaxonomicType; import au.org.ala.names.util.TestUtils; -import com.fasterxml.jackson.databind.JavaType; import com.fasterxml.jackson.databind.ObjectMapper; import org.gbif.api.vocabulary.NomenclaturalCode; import org.junit.Before; import org.junit.Test; -import java.util.List; - import static org.junit.Assert.*; /** diff --git a/src/test/java/au/org/ala/names/index/RankComparatorTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/RankComparatorTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java diff --git a/src/test/java/au/org/ala/names/index/ScientificNameTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java similarity index 100% rename from 
src/test/java/au/org/ala/names/index/ScientificNameTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java diff --git a/src/test/java/au/org/ala/names/index/TaxonConceptTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/TaxonConceptTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java diff --git a/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java similarity index 99% rename from src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java index 1a108e5a2..fd668a849 100644 --- a/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java @@ -5,7 +5,6 @@ import org.gbif.api.vocabulary.NomenclaturalCode; import org.gbif.checklistbank.authorship.AuthorComparator; import org.gbif.checklistbank.model.Equality; -import org.junit.Before; import org.junit.Test; import java.io.StringWriter; diff --git a/src/test/java/au/org/ala/names/index/TaxonomyTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/TaxonomyTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java diff --git a/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java similarity index 99% rename from src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java rename to 
ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java index 3f6e3be73..d79006135 100644 --- a/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java @@ -4,7 +4,6 @@ import au.org.ala.names.index.NameKey; import au.org.ala.names.index.NameProvider; import au.org.ala.names.index.TaxonConceptInstance; -import au.org.ala.names.model.RankType; import au.org.ala.names.model.TaxonomicType; import au.org.ala.names.util.TestUtils; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java diff --git a/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java diff --git a/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java diff --git a/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java 
b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java similarity index 100% rename from src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java diff --git a/src/test/java/au/org/ala/names/util/TestUtils.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java similarity index 100% rename from src/test/java/au/org/ala/names/util/TestUtils.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java diff --git a/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml similarity index 100% rename from src/test/resources/au/org/ala/names/index/dwca-1/meta.xml rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml diff --git a/src/test/resources/au/org/ala/names/index/dwca-1/taxon.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/taxon.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/dwca-1/taxon.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/taxon.csv diff --git a/src/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv diff --git a/src/test/resources/au/org/ala/names/index/name-provider-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/name-provider-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/name-provider-1.json rename to 
ala-name-matching-builder/src/test/resources/au/org/ala/names/index/name-provider-1.json diff --git a/src/test/resources/au/org/ala/names/index/provider/and-condition-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/and-condition-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/provider/and-condition-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/and-condition-1.json diff --git a/src/test/resources/au/org/ala/names/index/provider/match-condition-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/match-condition-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/provider/match-condition-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/match-condition-1.json diff --git a/src/test/resources/au/org/ala/names/index/provider/or-condition-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/or-condition-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/provider/or-condition-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/or-condition-1.json diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-1.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-1.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-1.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-1.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-10.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-10.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-10.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-10.csv diff --git 
a/src/test/resources/au/org/ala/names/index/taxonomy-11.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-11.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-11.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-11.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-12.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-12.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-12.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-12.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-13.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-13.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-13.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-13.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-14.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-14.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-14.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-14.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-15.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-15.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-15.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-15.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-16.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-16.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-16.csv rename to 
ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-16.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-17.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-17.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-17.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-17.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-18.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-18.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-18.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-18.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-19.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-19.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-19.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-19.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-2.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-2.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-2.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-2.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-20.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-20.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-20.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-20.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-21.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-21.csv similarity index 100% rename from 
src/test/resources/au/org/ala/names/index/taxonomy-21.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-21.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-22.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-22.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-22.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-22.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-23.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-23.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-23.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-23.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-24.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-24.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-24.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-24.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-25.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-25.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-25.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-25.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-26.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-26.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-26.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-26.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-27.csv 
b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-27.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-27.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-27.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-28.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-28.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-28.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-28.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-29.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-29.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-29.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-29.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-3.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-3.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-3.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-3.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-30.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-30.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-30.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-30.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-31.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-31.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-31.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-31.csv diff --git 
a/src/test/resources/au/org/ala/names/index/taxonomy-4.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-4.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-4.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-4.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-5.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-5.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-5.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-5.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-6.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-6.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-6.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-6.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-7.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-7.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-7.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-7.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-8.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-8.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-8.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-8.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-9.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-9.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-9.csv rename to 
ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-9.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-config-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-config-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-1.json diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-config-2.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-2.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-config-2.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-2.json diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-config-3.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-3.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-config-3.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-3.json diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-config-4.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-4.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-config-4.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-4.json diff --git a/src/test/resources/au/org/ala/names/index/vernacular-1.csv 
b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/vernacular-1.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/vernacular-1.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/vernacular-1.csv diff --git a/ala-name-matching-distribution/pom.xml b/ala-name-matching-distribution/pom.xml new file mode 100644 index 000000000..870f50442 --- /dev/null +++ b/ala-name-matching-distribution/pom.xml @@ -0,0 +1,67 @@ + + + + ala-name-matching + au.org.ala + 4.0-SNAPSHOT + + 4.0.0 + + ala-name-matching-distribution + + ALA Name Matching Distribution + Distribution along with dependencies + + + + ${project.parent.groupId} + ala-name-matching-model + ${project.version} + + + ${project.parent.groupId} + ala-name-matching-search + ${project.version} + + + ${project.parent.groupId} + ala-name-matching-builder + ${project.version} + + + ${project.parent.groupId} + ala-name-matching-tools + ${project.version} + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + src/assembly/assembly.xml + + + + true + lib/ + + + + + + package + + single + + + + + + + \ No newline at end of file diff --git a/ala-name-matching-distribution/src/assembly/assembly.xml b/ala-name-matching-distribution/src/assembly/assembly.xml new file mode 100644 index 000000000..6c2fe4958 --- /dev/null +++ b/ala-name-matching-distribution/src/assembly/assembly.xml @@ -0,0 +1,35 @@ + + distribution + + zip + + false + + + true + + au.org.ala:ala-name-matching-model + au.org.ala:ala-name-matching-search + au.org.ala:ala-name-matching-builder + au.org.ala:ala-name-matching-tools + + + lib + false + + + + + + ${project.build.scriptSourceDirectory} + + + merge.sh + index.sh + compare.sh + generate.sh + dump.sh + + + + \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/compare.sh b/ala-name-matching-distribution/src/main/scripts/compare.sh new file mode 100644 index 000000000..208e76ee3 --- 
/dev/null +++ b/ala-name-matching-distribution/src/main/scripts/compare.sh @@ -0,0 +1,4 @@ +#!/bin/sh +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS}" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.NameListComparer $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/dump.sh b/ala-name-matching-distribution/src/main/scripts/dump.sh new file mode 100644 index 000000000..bb43a64e8 --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/dump.sh @@ -0,0 +1,4 @@ +#!/bin/sh +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS}" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.TermDump $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/generate.sh b/ala-name-matching-distribution/src/main/scripts/generate.sh new file mode 100644 index 000000000..5043cc190 --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/generate.sh @@ -0,0 +1,4 @@ +#!/bin/sh +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS}" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.NameListGenerator $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/index.sh b/ala-name-matching-distribution/src/main/scripts/index.sh new file mode 100644 index 000000000..c2a771981 --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/index.sh @@ -0,0 +1,4 @@ +#!/bin/sh +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS}" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.search.DwcaNameIndexer $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/merge.sh b/ala-name-matching-distribution/src/main/scripts/merge.sh new file mode 100644 index 000000000..82ecda5f1 --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/merge.sh @@ -0,0 +1,4 @@ +#!/bin/sh +SCRIPT_HOME=`dirname $0` 
+JAVA_OPTIONS="${JAVA_OPTIONS} -Xmx6G" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.index.TaxonomyBuilder $* \ No newline at end of file diff --git a/ala-name-matching-model/pom.xml b/ala-name-matching-model/pom.xml new file mode 100644 index 000000000..5e4bf9222 --- /dev/null +++ b/ala-name-matching-model/pom.xml @@ -0,0 +1,44 @@ + + + + ala-name-matching + au.org.ala + 4.0-SNAPSHOT + + 4.0.0 + + ala-name-matching-model + ALA Name Matching Model + Core name matching data model and vocabularies + + + + org.apache.commons + commons-lang3 + 3.12.0 + + + org.gbif + gbif-common + 0.37 + + + org.gbif + name-parser + 2.24 + + + uk.ac.shef.wit + simmetrics + ${simmetrics.version} + + + com.opencsv + opencsv + ${opencsv.version} + test + + + \ No newline at end of file diff --git a/src/main/java/au/org/ala/names/model/ALAParsedName.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/ALAParsedName.java similarity index 100% rename from src/main/java/au/org/ala/names/model/ALAParsedName.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/ALAParsedName.java diff --git a/src/main/java/au/org/ala/names/model/ErrorType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/ErrorType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/ErrorType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/ErrorType.java diff --git a/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java similarity index 87% rename from src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java index d21949828..958b8041a 100644 --- a/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java +++ 
b/ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java @@ -1,12 +1,9 @@ package au.org.ala.names.model; -import org.apache.commons.lang.builder.EqualsBuilder; -import org.apache.commons.lang.builder.HashCodeBuilder; -import org.apache.commons.lang.builder.ToStringBuilder; -import org.apache.commons.lang3.StringUtils; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; +import org.apache.commons.lang3.builder.EqualsBuilder; +import org.apache.commons.lang3.builder.HashCodeBuilder; +import org.apache.commons.lang3.builder.ToStringBuilder; /** * A model object that represents a Linnaean Classification. @@ -360,7 +357,7 @@ public void setRank(String rank) { } /** - * @see java.lang.Object#toString() + * @see Object#toString() */ public String toString() { return new ToStringBuilder(this) @@ -388,7 +385,7 @@ public String toCSV(char sep) { } /** - * @see java.lang.Object#equals(Object) + * @see Object#equals(Object) */ public boolean equals(Object object) { if (!(object instanceof LinnaeanRankClassification)) { @@ -461,35 +458,6 @@ public boolean hasIdenticalClassification(LinnaeanRankClassification lrc, RankTy return true; } - /** - * Returns the additional string that needs to be included in a search - * - * @param optional Indicates whether the the terms should be optional - * @return - */ - public void appendLuceneQuery(BooleanQuery.Builder builder, boolean optional) { - BooleanClause.Occur occurs = optional ? 
BooleanClause.Occur.SHOULD : BooleanClause.Occur.FILTER; - StringBuilder sb = new StringBuilder(); - if (StringUtils.isNotEmpty(kingdom)) - builder.add(NameIndexField.KINGDOM.search(this.kingdom), occurs); - if (StringUtils.isNotEmpty(phylum)) - builder.add(NameIndexField.PHYLUM.search(this.phylum), occurs); - if (StringUtils.isNotEmpty(klass)) - builder.add(NameIndexField.CLASS.search(this.klass), occurs); - if (StringUtils.isNotEmpty(order)) - builder.add(NameIndexField.ORDER.search(this.order), occurs); - if (StringUtils.isNotEmpty(family)) - builder.add(NameIndexField.FAMILY.search(this.family), occurs); - if (StringUtils.isNotEmpty(genus)) - builder.add(NameIndexField.GENUS.search(this.genus), occurs); - if (StringUtils.isNotEmpty(species)) - builder.add(NameIndexField.SPECIES.search(this.species), occurs); - //authorship is always optional due to inconsistencies in the name format etc... - if (StringUtils.isNotEmpty(authorship)) - builder.add(NameIndexField.AUTHOR.search(this.authorship), BooleanClause.Occur.SHOULD); - } - - public static void main(String[] args) { LinnaeanRankClassification a = new LinnaeanRankClassification(null, null, null, null, null, null, "AuS bus"); LinnaeanRankClassification b = new LinnaeanRankClassification(null, null, null, null, null, null, new String("Aus bus")); @@ -498,7 +466,7 @@ public static void main(String[] args) { } /** - * @see java.lang.Object#hashCode() + * @see Object#hashCode() */ public int hashCode() { return new HashCodeBuilder(1497136033, 448920019).append(this.scientificName).append( diff --git a/src/main/java/au/org/ala/names/model/MatchMetrics.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java similarity index 98% rename from src/main/java/au/org/ala/names/model/MatchMetrics.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java index a3a8efd1d..0121e3a50 100644 --- a/src/main/java/au/org/ala/names/model/MatchMetrics.java +++ 
b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java @@ -1,6 +1,5 @@ package au.org.ala.names.model; -import au.org.ala.names.index.TaxonConceptInstance; import org.apache.commons.lang3.StringUtils; import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; import uk.ac.shef.wit.simmetrics.similaritymetrics.SmithWatermanGotoh; @@ -47,7 +46,7 @@ public MatchMetrics() { * * @return The priority * - * @see TaxonConceptInstance#getScore() + * @see au.org.ala.names.index.TaxonConceptInstance#getScore() */ public int getPriority() { return priority; diff --git a/src/main/java/au/org/ala/names/model/MatchType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/MatchType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchType.java diff --git a/src/main/java/au/org/ala/names/model/MetricsResultDTO.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MetricsResultDTO.java similarity index 100% rename from src/main/java/au/org/ala/names/model/MetricsResultDTO.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/MetricsResultDTO.java diff --git a/src/main/java/au/org/ala/names/model/NameFlag.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java similarity index 100% rename from src/main/java/au/org/ala/names/model/NameFlag.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java diff --git a/src/main/java/au/org/ala/names/model/NameSearchResult.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameSearchResult.java similarity index 71% rename from src/main/java/au/org/ala/names/model/NameSearchResult.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/NameSearchResult.java index fc2abd7eb..cdf1f083b 100644 --- 
a/src/main/java/au/org/ala/names/model/NameSearchResult.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameSearchResult.java @@ -15,9 +15,7 @@ package au.org.ala.names.model; -import org.apache.commons.lang.StringUtils; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexableField; +import org.apache.commons.lang3.StringUtils; import java.util.LinkedHashMap; import java.util.Map; @@ -54,48 +52,30 @@ public NameSearchResult(String id, String lsid, MatchType type) { this.matchMetrics = new MatchMetrics(); } - public NameSearchResult(Document doc, MatchType type) { - this(doc.get(NameIndexField.ID.toString()), doc.get(NameIndexField.LSID.toString()), type); - kingdom = doc.get(RankType.KINGDOM.getRank()); - //System.out.println("Rank to use : " +doc.get(IndexField.RANK.toString())); - try { - rank = RankType.getForId(Integer.parseInt(doc.get(NameIndexField.RANK_ID.toString()))); - } catch (Exception e) { - } - String name = doc.get(NameIndexField.NAME_CANONICAL.toString()); - if (name == null) - name = doc.get(NameIndexField.NAME.toString()); - if (name == null) - name = doc.get(NameIndexField.NAME_COMPLETE.toString()); - rankClass = new LinnaeanRankClassification(doc.get(RankType.KINGDOM.getRank()), - doc.get(RankType.PHYLUM.getRank()), - doc.get(RankType.CLASS.getRank()), - doc.get(RankType.ORDER.getRank()), - doc.get(RankType.FAMILY.getRank()), - doc.get(RankType.GENUS.getRank()), - name); - rankClass.setSpecies(doc.get(RankType.SPECIES.getRank())); - //add the ids - rankClass.setKid(doc.get("kid")); - rankClass.setPid(doc.get("pid")); - rankClass.setCid(doc.get("cid")); - rankClass.setOid(doc.get("oid")); - rankClass.setFid(doc.get("fid")); - rankClass.setGid(doc.get("gid")); - rankClass.setSid(doc.get("sid")); - rankClass.setAuthorship(doc.get(NameIndexField.AUTHOR.toString())); - //left and right values for the taxon concept - left = doc.get("left"); - right = doc.get("right"); - synonymType = 
SynonymType.getTypeFor(doc.get(NameIndexField.SYNONYM_TYPE.toString())); - String syn = doc.get(NameIndexField.ACCEPTED.toString()); - if (syn != null) { - acceptedLsid = syn; - } - IndexableField priority = doc.getField(NameIndexField.PRIORITY.toString()); + /** + * Construct a fully filled out result + * + * @param id The result identifier + * @param lsid The lsid of the concept + * @param acceptedLsid The lsid of the accepted concept + * @param left The left-value + * @param right The right-value + * @param rankClass The linnaean classification + * @param rank The rank + * @param type The match type + * @param synonymType The synonym type + * @param priority An optional match priority + */ + public NameSearchResult(String id, String lsid, String acceptedLsid, String left, String right, LinnaeanRankClassification rankClass, RankType rank, MatchType type, SynonymType synonymType, Integer priority) { + this(id, lsid, type); + this.acceptedLsid = acceptedLsid; + this.left = left; + this.right = right; + this.rankClass = rankClass; + this.rank = rank; + this.synonymType = synonymType; if (priority != null) - this.matchMetrics.setPriority(priority.numericValue().intValue()); - + this.matchMetrics.setPriority(priority); } public SynonymType getSynonymType() { diff --git a/src/main/java/au/org/ala/names/model/RankType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/RankType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/RankType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/RankType.java diff --git a/src/main/java/au/org/ala/names/model/SynonymType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/SynonymType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/SynonymType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/SynonymType.java diff --git a/src/main/java/au/org/ala/names/model/TaxonomicType.java 
b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/TaxonomicType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java diff --git a/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java similarity index 100% rename from src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java diff --git a/src/main/java/au/org/ala/names/model/VernacularType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/VernacularType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java diff --git a/src/main/java/au/org/ala/names/search/ExcludedNameException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/ExcludedNameException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/ExcludedNameException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/ExcludedNameException.java diff --git a/src/main/java/au/org/ala/names/search/HomonymException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/HomonymException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/HomonymException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/HomonymException.java index 18e634062..0eee0734f 100644 --- a/src/main/java/au/org/ala/names/search/HomonymException.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/search/HomonymException.java @@ -15,10 +15,10 @@ package au.org.ala.names.search; -import java.util.List; - import au.org.ala.names.model.NameSearchResult; +import 
java.util.List; + /** * Exception that is thrown when the result is an unresolved * homonym diff --git a/src/main/java/au/org/ala/names/search/MisappliedException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/MisappliedException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/MisappliedException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/MisappliedException.java diff --git a/src/main/java/au/org/ala/names/search/ParentSynonymChildException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/ParentSynonymChildException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/ParentSynonymChildException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/ParentSynonymChildException.java diff --git a/src/main/java/au/org/ala/names/search/SPPException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/SPPException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/SPPException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/SPPException.java diff --git a/src/main/java/au/org/ala/names/search/SearchResultException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/SearchResultException.java similarity index 99% rename from src/main/java/au/org/ala/names/search/SearchResultException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/SearchResultException.java index a277c7046..82044dfdb 100644 --- a/src/main/java/au/org/ala/names/search/SearchResultException.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/search/SearchResultException.java @@ -16,11 +16,10 @@ package au.org.ala.names.search; import au.org.ala.names.model.ErrorType; +import au.org.ala.names.model.NameSearchResult; import java.util.List; -import au.org.ala.names.model.NameSearchResult; - /** * The generic search result exception 
that can be thrown during a search. This exception * will be used to wrap any exception that occurs that do not fall into the other categories. diff --git a/src/main/java/au/org/ala/names/util/CleanedScientificName.java b/ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java similarity index 100% rename from src/main/java/au/org/ala/names/util/CleanedScientificName.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java diff --git a/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java b/ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java similarity index 99% rename from src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java index 96009a3cb..bb941f19e 100644 --- a/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java @@ -5,7 +5,7 @@ import java.util.List; import java.util.StringTokenizer; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; /** * A Java implementation of the sound ex algorithm supplied by Tony Rees diff --git a/src/main/java/au/org/ala/vocab/ALATerm.java b/ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java similarity index 100% rename from src/main/java/au/org/ala/vocab/ALATerm.java rename to ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java diff --git a/src/main/java/org/gbif/nameparser/PhraseNameParser.java b/ala-name-matching-model/src/main/java/org/gbif/nameparser/PhraseNameParser.java similarity index 98% rename from src/main/java/org/gbif/nameparser/PhraseNameParser.java rename to ala-name-matching-model/src/main/java/org/gbif/nameparser/PhraseNameParser.java index 51e493ac9..f25a66e8e 100644 --- a/src/main/java/org/gbif/nameparser/PhraseNameParser.java +++ 
b/ala-name-matching-model/src/main/java/org/gbif/nameparser/PhraseNameParser.java @@ -16,8 +16,7 @@ package org.gbif.nameparser; import au.org.ala.names.model.ALAParsedName; -import au.org.ala.names.model.RankType; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.text.WordUtils; import org.gbif.api.exception.UnparsableException; import org.gbif.api.model.checklistbank.ParsedName; @@ -25,8 +24,6 @@ import org.gbif.api.vocabulary.Rank; import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git a/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java b/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java similarity index 100% rename from src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java rename to ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java diff --git a/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java b/ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java similarity index 100% rename from src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java rename to ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java diff --git a/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java b/ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java similarity index 96% rename from src/test/java/org/gbif/nameparser/PhraseNameParserTest.java rename to ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java index 49e1e56b9..976cb7d22 100644 --- a/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java +++ b/ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java @@ -3,8 +3,6 @@ package org.gbif.nameparser; 
import au.org.ala.names.model.ALAParsedName; -import au.org.ala.names.util.CleanedScientificName; -import org.codehaus.jackson.map.jsontype.NamedType; import org.gbif.api.model.checklistbank.ParsedName; import org.gbif.api.vocabulary.NameType; import org.junit.Before; diff --git a/ala-name-matching-search/pom.xml b/ala-name-matching-search/pom.xml new file mode 100644 index 000000000..5bfb1842a --- /dev/null +++ b/ala-name-matching-search/pom.xml @@ -0,0 +1,55 @@ + + + 4.0.0 + + + au.org.ala + ala-name-matching + 4.0-SNAPSHOT + + + ala-name-matching-search + jar + + ALA Name Matching Search Library + A library that connects to a local name index and provides name lookup services. + + + + ${project.parent.groupId} + ala-name-matching-model + ${project.version} + + + com.opencsv + opencsv + ${opencsv.version} + + + commons-io + commons-io + ${commons-io.version} + + + + org.apache.lucene + lucene-core + ${org.apache.lucene.version} + + + org.apache.lucene + lucene-backward-codecs + ${org.apache.lucene.version} + + + org.apache.lucene + lucene-analyzers-common + ${org.apache.lucene.version} + + + org.apache.lucene + lucene-queryparser + ${org.apache.lucene.version} + + + diff --git a/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java b/ala-name-matching-search/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java similarity index 88% rename from src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java rename to ala-name-matching-search/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java index e73a7551c..a8710a900 100644 --- a/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java @@ -14,20 +14,15 @@ */ package au.org.ala.names.lucene.analyzer; -import java.io.IOException; -import java.io.Reader; - import org.apache.lucene.analysis.Analyzer; -import 
org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.KeywordTokenizerFactory; -import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.LowerCaseFilterFactory; import org.apache.lucene.analysis.custom.CustomAnalyzer; -import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; + /** * A custom KeywordAnalyzer that converts the text to lowercase before tokenizing * the complete string as one token diff --git a/src/main/java/au/org/ala/names/search/ALANameSearcher.java b/ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java similarity index 95% rename from src/main/java/au/org/ala/names/search/ALANameSearcher.java rename to ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java index d1355169a..451e522b4 100644 --- a/src/main/java/au/org/ala/names/search/ALANameSearcher.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java @@ -17,12 +17,12 @@ import au.org.ala.names.model.*; import au.org.ala.names.util.CleanedScientificName; import au.org.ala.names.util.TaxonNameSoundEx; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.*; import org.apache.lucene.store.FSDirectory; import org.gbif.api.exception.UnparsableException; @@ -1157,8 +1157,8 @@ private List performSearch(List compulsoryValues, RankT builder.add(rankBuilder.build(), BooleanClause.Occur.MUST); } if (cl != null) { - cl.appendLuceneQuery(builder, true); - } + this.appendLuceneQuery(cl, builder, 
true); + } Query query = builder.build(); TopDocs hits = cbSearcher.search(query, max);//cbSearcher.search(boolQuery, max); @@ -1167,7 +1167,7 @@ private List performSearch(List compulsoryValues, RankT List results = new java.util.ArrayList(); for (ScoreDoc sdoc : hits.scoreDocs) { - NameSearchResult nsr = new NameSearchResult(cbReader.document(sdoc.doc), type); + NameSearchResult nsr = this.createResult(cbReader.document(sdoc.doc), type); nsr.computeMatch(cl); results.add(nsr); } @@ -1481,7 +1481,7 @@ public TopDocs getIRMNGGenus(LinnaeanRankClassification cl, RankType rank) { try { BooleanQuery.Builder builder = new BooleanQuery.Builder(); builder.add(NameIndexField.RANK.search(rank.getRank()), BooleanClause.Occur.MUST); - cl.appendLuceneQuery(builder, false); + this.appendLuceneQuery(cl, builder, false); Query query = builder.build(); log.debug("getIRMNG query: " + query.toString()); return irmngSearcher.search(query, 10); @@ -1758,7 +1758,7 @@ public NameSearchResult searchForRecordByLsid(String lsid) { if (hits.totalHits.value == 0) hits = this.cbSearcher.search(query, 1); if (hits.totalHits.value > 0) - return new NameSearchResult(cbSearcher.doc(hits.scoreDocs[0].doc), MatchType.TAXON_ID); + return this.createResult(cbSearcher.doc(hits.scoreDocs[0].doc), MatchType.TAXON_ID); } catch (Exception ex) { log.error("Unable to search for record by LSID " + lsid, ex); } @@ -1800,7 +1800,7 @@ private void appendAutocompleteResults(Map output, TopDocs results, Document src = commonNameResults ? vernSearcher.doc(i.doc) : cbSearcher.doc(i.doc); NameSearchResult nsr = commonNameResults ? searchForRecordByLsid(src.get("lsid")) - : new NameSearchResult(src, null); + : this.createResult(src, null); if (nsr == null || (nsr.getLeft() == null && !includeSynonyms)) continue; @@ -2112,6 +2112,90 @@ private String escapeQueryChars(String s) { return sb.toString(); } + /** + * Construct a name match result from a document. 
+ * + * @param doc The document + * @param type The match type + * + * @return The resulting name match + */ + protected NameSearchResult createResult(Document doc, MatchType type) { + String name = doc.get(NameIndexField.NAME_CANONICAL.toString()); + if (name == null) + name = doc.get(NameIndexField.NAME.toString()); + if (name == null) + name = doc.get(NameIndexField.NAME_COMPLETE.toString()); + LinnaeanRankClassification rankClass = new LinnaeanRankClassification(doc.get(RankType.KINGDOM.getRank()), + doc.get(RankType.PHYLUM.getRank()), + doc.get(RankType.CLASS.getRank()), + doc.get(RankType.ORDER.getRank()), + doc.get(RankType.FAMILY.getRank()), + doc.get(RankType.GENUS.getRank()), + name); + rankClass.setSpecies(doc.get(RankType.SPECIES.getRank())); + //add the ids + rankClass.setKid(doc.get("kid")); + rankClass.setPid(doc.get("pid")); + rankClass.setCid(doc.get("cid")); + rankClass.setOid(doc.get("oid")); + rankClass.setFid(doc.get("fid")); + rankClass.setGid(doc.get("gid")); + rankClass.setSid(doc.get("sid")); + rankClass.setAuthorship(doc.get(NameIndexField.AUTHOR.toString())); + + String id = doc.get(NameIndexField.ID.toString()); + String lsid = doc.get(NameIndexField.LSID.toString()); + String kingdom = doc.get(RankType.KINGDOM.getRank()); + RankType rank = null; + try { + rank = RankType.getForId(Integer.parseInt(doc.get(NameIndexField.RANK_ID.toString()))); + } catch (Exception e) { + } + //left and right values for the taxon concept + String left = doc.get("left"); + String right = doc.get("right"); + SynonymType synonymType = SynonymType.getTypeFor(doc.get(NameIndexField.SYNONYM_TYPE.toString())); + String acceptedLsid = doc.get(NameIndexField.ACCEPTED.toString()); + IndexableField pf = doc.getField(NameIndexField.PRIORITY.toString()); + Integer priority = pf == null ? 
null : pf.numericValue().intValue(); + NameSearchResult result = new NameSearchResult(id, lsid, acceptedLsid, left, right, rankClass, rank, type, synonymType, priority); + result.setRank(rank); + result.setLeft(left); + result.setRight(right); + return result; + } + + + /** + * Appends the non-empty rank terms of the classification to the query being built + * + * @param optional Indicates whether the terms should be optional + * @return + */ + public void appendLuceneQuery(LinnaeanRankClassification classification, BooleanQuery.Builder builder, boolean optional) { + BooleanClause.Occur occurs = optional ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.FILTER; + StringBuilder sb = new StringBuilder(); + if (StringUtils.isNotEmpty(classification.getKingdom())) + builder.add(NameIndexField.KINGDOM.search(classification.getKingdom()), occurs); + if (StringUtils.isNotEmpty(classification.getPhylum())) + builder.add(NameIndexField.PHYLUM.search(classification.getPhylum()), occurs); + if (StringUtils.isNotEmpty(classification.getKlass())) + builder.add(NameIndexField.CLASS.search(classification.getKlass()), occurs); + if (StringUtils.isNotEmpty(classification.getOrder())) + builder.add(NameIndexField.ORDER.search(classification.getOrder()), occurs); + if (StringUtils.isNotEmpty(classification.getFamily())) + builder.add(NameIndexField.FAMILY.search(classification.getFamily()), occurs); + if (StringUtils.isNotEmpty(classification.getGenus())) + builder.add(NameIndexField.GENUS.search(classification.getGenus()), occurs); + if (StringUtils.isNotEmpty(classification.getSpecies())) + builder.add(NameIndexField.SPECIES.search(classification.getSpecies()), occurs); + //authorship is always optional due to inconsistencies in the name format etc... 
+ if (StringUtils.isNotEmpty(classification.getAuthorship())) + builder.add(NameIndexField.AUTHOR.search(classification.getAuthorship()), BooleanClause.Occur.SHOULD); + } + + public static void main(String[] args) throws IOException { ALANameSearcher nameindex = new ALANameSearcher(args[0]); diff --git a/src/main/java/au/org/ala/names/model/FieldType.java b/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java similarity index 98% rename from src/main/java/au/org/ala/names/model/FieldType.java rename to ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java index 16231e761..61d1d45f3 100644 --- a/src/main/java/au/org/ala/names/model/FieldType.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java @@ -1,4 +1,4 @@ -package au.org.ala.names.model; +package au.org.ala.names.search; import au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer; import org.apache.lucene.analysis.Analyzer; @@ -8,9 +8,6 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.util.QueryBuilder; -import java.util.function.BiConsumer; -import java.util.function.BiFunction; - /** * The type of field stored in the lucene index. *

diff --git a/src/main/java/au/org/ala/names/model/NameIndexField.java b/ala-name-matching-search/src/main/java/au/org/ala/names/search/NameIndexField.java similarity index 93% rename from src/main/java/au/org/ala/names/model/NameIndexField.java rename to ala-name-matching-search/src/main/java/au/org/ala/names/search/NameIndexField.java index ec5237840..16219e6d6 100644 --- a/src/main/java/au/org/ala/names/model/NameIndexField.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/search/NameIndexField.java @@ -12,7 +12,7 @@ * implied. See the License for the specific language governing * rights and limitations under the License. */ -package au.org.ala.names.model; +package au.org.ala.names.search; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; @@ -20,9 +20,7 @@ import org.apache.lucene.search.WildcardQuery; /** - * An Enum for all the fields that are indexed for the name matching. This enum is used by - * {@link au.org.ala.names.search.ALANameIndexer} to create the index and - * {@link au.org.ala.names.search.ALANameSearcher} to search the index + * An Enum for all the fields that are indexed for the name matching. 
* * @author Natasha Carter */ @@ -97,7 +95,7 @@ public String toString() { * @param value The value * @param document The document */ - public void store(Object value, Document document) { + public void store(T value, Document document) { if (value == null) return; this.type.store(value, this.name, document); @@ -110,7 +108,7 @@ public void store(Object value, Document document) { * * @return A matching query */ - public Query search(Object value) { + public Query search(T value) { return this.type.search(value, this.name); } @@ -122,7 +120,7 @@ public Query search(Object value) { * * @return A matching query */ - public Query searchRange(Object lower, Object upper) { + public Query searchRange(T lower, T upper) { return this.type.searchRange(lower, upper, this.name); } diff --git a/src/main/java/au/org/ala/names/util/FileUtils.java b/ala-name-matching-search/src/main/java/au/org/ala/names/util/FileUtils.java similarity index 98% rename from src/main/java/au/org/ala/names/util/FileUtils.java rename to ala-name-matching-search/src/main/java/au/org/ala/names/util/FileUtils.java index cd3cee8ca..8b0fde71e 100644 --- a/src/main/java/au/org/ala/names/util/FileUtils.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/util/FileUtils.java @@ -19,7 +19,7 @@ import java.util.Set; import org.apache.commons.io.LineIterator; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; /** * Some Generic file utilities. 
diff --git a/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java b/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java new file mode 100644 index 000000000..bb941f19e --- /dev/null +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java @@ -0,0 +1,281 @@ +package au.org.ala.names.util; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.StringTokenizer; + +import org.apache.commons.lang3.StringUtils; + +/** + * A Java implementation of the sound ex algorithm supplied by Tony Rees + * Copied from Taxamatch project. We don't need full taxamatch... + */ +public class TaxonNameSoundEx { + + private static String translate(String source, String transSource, String transTarget) { + String result = source; + + while (transSource.length() > transTarget.length()) { + transTarget += " "; + } + for (int i = 0; i < transSource.length(); i++) { + result = result.replace(transSource.charAt(i), transTarget.charAt(i)); + } + return result; + } + + + public static String normalize(String str) { + + if (str == null) return null; + + String output = str; + + // trim any leading, trailing spaces or line feeds + //output = ltrim(rtrim(str)); + + output = output.replace(" cf ", " "); + output = output.replace(" cf. ", " "); + output = output.replace(" near ", " "); + output = output.replace(" aff. ", " "); + output = output.replace(" sp.", " "); + output = output.replace(" spp.", " "); + output = output.replace(" spp ", " "); + + output = str.toUpperCase(); + + // replace any HTML ampersands + output = output.replace(" & ", " & "); + + // remove any content in angle brackets (e.g. html tags - , , etc.) 
+ output = output.replaceAll("\\<.+?\\>", ""); + + output = translate(output, "\u00c1\u00c9\u00cd\u00d3\u00da\u00c0\u00c8\u00cc\u00d2\u00d9" + + "\u00c2\u00ca\u00ce\u00d4\u00db\u00c4\u00cb\u00cf\u00d6\u00dc\u00c3\u00d1\u00d5" + + "\u00c5\u00c7\u00d8", "AEIOUAEIOUAEIOUAEIOUANOACO"); + + output = output.replace("\u00c6", "AE"); + output = output.replaceAll("[^a-zA-Z .]", ""); + output = StringUtils.trimToNull(output); + + return output; + } + + + public static String treatWord(String str2, String wordType) { + char startLetter; + String temp = normalize(str2); + // Do some selective replacement on the leading letter/s only: + if (StringUtils.isNotEmpty(temp)) { + if (temp.startsWith("AE")) { + temp = "E" + temp.substring(2); + } else if (temp.startsWith("CN")) { + temp = "N" + temp.substring(2); + } else if (temp.startsWith("CT")) { + temp = "T" + temp.substring(2); + } else if (temp.startsWith("CZ")) { + temp = "C" + temp.substring(2); + } else if (temp.startsWith("DJ")) { + temp = "J" + temp.substring(2); + } else if (temp.startsWith("EA")) { + temp = "E" + temp.substring(2); + } else if (temp.startsWith("EU")) { + temp = "U" + temp.substring(2); + } else if (temp.startsWith("GN")) { + temp = "N" + temp.substring(2); + } else if (temp.startsWith("KN")) { + temp = "N" + temp.substring(2); + } else if (temp.startsWith("MC")) { + temp = "MAC" + temp.substring(2); + } else if (temp.startsWith("MN")) { + temp = "N" + temp.substring(2); + } else if (temp.startsWith("OE")) { + temp = "E" + temp.substring(2); + } else if (temp.startsWith("QU")) { + temp = "Q" + temp.substring(2); + } else if (temp.startsWith("PS")) { + temp = "S" + temp.substring(2); + } else if (temp.startsWith("PT")) { + temp = "T" + temp.substring(2); + } else if (temp.startsWith("TS")) { + temp = "S" + temp.substring(2); + } else if (temp.startsWith("WR")) { + temp = "R" + temp.substring(2); + } else if (temp.startsWith("X")) { + temp = "Z" + temp.substring(2); + } + // Now keep the leading character, 
then do selected "soundalike" replacements. The + // following letters are equated: AE, OE, E, U, Y and I; IA and A are equated; + // K and C; Z and S; and H is dropped. Also, A and O are equated, MAC and MC are equated, and SC and S. + startLetter = temp.charAt(0); // quarantine the leading letter + temp = temp.substring(1); // snip off the leading letter + // now do the replacements + temp = temp.replaceAll("AE", "I"); + temp = temp.replaceAll("IA", "A"); + temp = temp.replaceAll("OE", "I"); + temp = temp.replaceAll("OI", "A"); + temp = temp.replaceAll("SC", "S"); + temp = temp.replaceAll("E", "I"); + temp = temp.replaceAll("O", "A"); + temp = temp.replaceAll("U", "I"); + temp = temp.replaceAll("Y", "I"); + temp = temp.replaceAll("K", "C"); + temp = temp.replaceAll("Z", "C"); + temp = temp.replaceAll("H", ""); + // add back the leading letter + temp = startLetter + temp; + // now drop any repeated characters (AA becomes A, BB or BBB becomes B, etc.) + temp = temp.replaceAll("(\\w)\\1+", "$1"); + + if (wordType == "species") { + if (temp.endsWith("IS")) { + temp = temp.substring(0, temp.length() - 2) + "A"; + } else if (temp.endsWith("IM")) { + temp = temp.substring(0, temp.length() - 2) + "A"; + } else if (temp.endsWith("AS")) { + temp = temp.substring(0, temp.length() - 2) + "A"; + } + //temp = temp.replaceAll("(\\w)\\1+", "$1"); + } + } + return temp; + } + + + /** + * Returns the SoundEx for the source string + * + * @param source String to get the sound ex of + * @return The sound ex string + */ + public String soundEx(String source) { + String temp = source.toUpperCase(); + temp = selectiveReplaceFirstChar(temp); + temp = selectiveReplaceWithoutFirstChar(temp); + temp = removeRepeatedChars(temp); + temp = alphabetiseWordsIgnoringFirstLetter(temp); + + return temp; + } + + /** + * Ignoring the first letter, alphabetise each word + */ + String alphabetiseWordsIgnoringFirstLetter(String source) { + StringTokenizer st = new StringTokenizer(source, " "); + 
StringBuffer sb = new StringBuffer(); + while (st.hasMoreTokens()) { + String token = st.nextToken(); + char[] chars = token.toCharArray(); + List charList = new LinkedList(); + for (int i = 1; i < chars.length; i++) { + charList.add(chars[i]); + } + Collections.sort(charList); + sb.append(chars[0]); + for (Character c : charList) { + sb.append(c); + } + if (st.hasMoreTokens()) { + sb.append(" "); + } + } + return sb.toString(); + } + + /** + * Removes repeated characters + * Can't get the regex version working so pretty primitive... + */ + String removeRepeatedChars(String source) { + StringBuffer sb = new StringBuffer(); + char c = ' '; + for (int i = 0; i < source.length(); i++) { + char sourceC = source.charAt(i); + if (sourceC != c) { + sb.append(sourceC); + } + c = sourceC; + } + return sb.toString(); + } + + /** + * Ignoring the first character, selectively replace sound alikes + */ + String selectiveReplaceWithoutFirstChar(String source) { + if (source.length() > 1) { + String temp = source.substring(1); + temp = temp.replaceAll("AE", "I"); + temp = temp.replaceAll("IA", "A"); + temp = temp.replaceAll("OE", "I"); + temp = temp.replaceAll("OI", "A"); + temp = temp.replaceAll("MC", "MAC"); + temp = temp.replaceAll("SC", "S"); + temp = temp.replaceAll("EOUYKZH", "IAIICS"); + + return source.substring(0, 1) + temp; + } else { + return source; + } + } + + /** + * Selectively replaces the first character + */ + String selectiveReplaceFirstChar(String source) { + if (source.startsWith("Æ")) { + return source.replaceFirst("Æ", "E"); + + } else if (source.startsWith("AE")) { + return source.replaceFirst("AE", "E"); + + } else if (source.startsWith("CN")) { + return source.replaceFirst("CN", "N"); + + } else if (source.startsWith("CT")) { + return source.replaceFirst("CT", "T"); + + } else if (source.startsWith("CZ")) { + return source.replaceFirst("CZ", "C"); + + } else if (source.startsWith("DJ")) { + return source.replaceFirst("DJ", "J"); + + } else if 
(source.startsWith("EA")) { + return source.replaceFirst("EA", "E"); + + } else if (source.startsWith("EU")) { + return source.replaceFirst("EU", "U"); + + } else if (source.startsWith("GN")) { + return source.replaceFirst("GN", "N"); + + } else if (source.startsWith("KN")) { + return source.replaceFirst("KN", "N"); + + } else if (source.startsWith("MN")) { + return source.replaceFirst("MN", "N"); + + } else if (source.startsWith("OE")) { + return source.replaceFirst("OE", "E"); + + } else if (source.startsWith("QU")) { + return source.replaceFirst("QU", "Q"); + + } else if (source.startsWith("PS")) { + return source.replaceFirst("PS", "S"); + + } else if (source.startsWith("PT")) { + return source.replaceFirst("PT", "T"); + + } else if (source.startsWith("TS")) { + return source.replaceFirst("TS", "S"); + + } else if (source.startsWith("X")) { + return source.replaceFirst("X", "Z"); + + } else return source; + } +} \ No newline at end of file diff --git a/ala-name-matching-search/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt b/ala-name-matching-search/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt new file mode 100644 index 000000000..6f3cca49b --- /dev/null +++ b/ala-name-matching-search/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt @@ -0,0 +1,33 @@ +# A list of the cross rank homonyms according to Tony Rees source: wikispecies homonyms lists, 1/12/2010: +Acanthocephala +Acantharia +Acrasia +Adenophora +Adenophora +Anisoptera +Anura +Articulata +Cerapoda +Coccoidea +Ctenophora +Dirina +Echinacea +Ensifera +Ephemeroidea +Furnariidae +Lestoidea +Lichina +Lobata +Oligochaeta +Ommatophora +Patellina +Pholidota +Platynota +Plecoptera +Pogonophora +Polychaeta +Polyphaga +Pterygota +Raphiinae +Symphyta +Theria \ No newline at end of file diff --git a/src/main/resources/au/org/ala/propertystore/known_homonyms.txt b/ala-name-matching-search/src/main/resources/au/org/ala/propertystore/known_homonyms.txt similarity index 
100% rename from src/main/resources/au/org/ala/propertystore/known_homonyms.txt rename to ala-name-matching-search/src/main/resources/au/org/ala/propertystore/known_homonyms.txt diff --git a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java similarity index 100% rename from src/test/java/au/org/ala/names/search/ALANameSearcherTest.java rename to ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java diff --git a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java similarity index 100% rename from src/test/java/au/org/ala/names/search/BiocacheMatchTest.java rename to ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java diff --git a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java similarity index 98% rename from src/test/java/au/org/ala/names/search/IconicSpeciesTest.java rename to ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java index 33fee2f64..d628d60a7 100644 --- a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java @@ -1,15 +1,12 @@ package au.org.ala.names.search; -import com.opencsv.CSVReader; -import au.org.ala.names.model.NameSearchResult; import au.org.ala.names.model.LinnaeanRankClassification; +import au.org.ala.names.model.NameSearchResult; import au.org.ala.names.model.RankType; -import org.apache.commons.lang.StringUtils; -import org.junit.Ignore; +import com.opencsv.CSVReader; +import org.apache.commons.lang3.StringUtils; import org.junit.Test; -import java.io.File; -import java.io.FileReader; import java.io.InputStreamReader; import java.util.List; diff 
--git a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java similarity index 100% rename from src/test/java/au/org/ala/names/search/VernacularMatchTest.java rename to ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java diff --git a/src/test/resources/au/org/ala/names/search/iconic_species_list.csv b/ala-name-matching-search/src/test/resources/au/org/ala/names/search/iconic_species_list.csv similarity index 100% rename from src/test/resources/au/org/ala/names/search/iconic_species_list.csv rename to ala-name-matching-search/src/test/resources/au/org/ala/names/search/iconic_species_list.csv diff --git a/ala-name-matching-tools/pom.xml b/ala-name-matching-tools/pom.xml new file mode 100644 index 000000000..41b6a19b1 --- /dev/null +++ b/ala-name-matching-tools/pom.xml @@ -0,0 +1,38 @@ + + + + ala-name-matching + au.org.ala + 4.0-SNAPSHOT + + 4.0.0 + + ala-name-matching-tools + ALA Name Matching Tools + Tools for testing and analysing name matching indexes + + + + au.org.ala + ala-name-matching-model + ${project.version} + + + au.org.ala + ala-name-matching-search + ${project.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + + commons-cli + commons-cli + ${commons-cli.version} + + + \ No newline at end of file diff --git a/src/main/java/au/org/ala/names/util/NameListComparer.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java similarity index 98% rename from src/main/java/au/org/ala/names/util/NameListComparer.java rename to ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java index 1b19ad0bb..622b43856 100644 --- a/src/main/java/au/org/ala/names/util/NameListComparer.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java @@ -4,9 +4,9 @@ import au.org.ala.names.model.*; import au.org.ala.names.search.*; import 
org.apache.commons.cli.*; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.*; import java.util.*; @@ -19,7 +19,7 @@ * Copyright (c) 2016 CSIRO */ public class NameListComparer { - private static Log log = LogFactory.getLog(NameListComparer.class); + private static Logger log = LoggerFactory.getLogger(NameListComparer.class); private static String[][] TERMS = { { "originalId", "Species", "taxonConceptID", "taxon_concept_lsid", "taxonID" }, diff --git a/src/main/java/au/org/ala/names/util/NameListGenerator.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java similarity index 95% rename from src/main/java/au/org/ala/names/util/NameListGenerator.java rename to ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java index e1ee0e0f7..2a000a38b 100644 --- a/src/main/java/au/org/ala/names/util/NameListGenerator.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java @@ -3,13 +3,12 @@ import com.opencsv.CSVWriter; import au.org.ala.names.model.SynonymType; import org.apache.commons.cli.*; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.store.FSDirectory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.*; import java.util.*; @@ -23,7 +22,7 @@ * Copyright (c) 2015 CSIRO */ public class NameListGenerator implements Closeable { - private static Log log = LogFactory.getLog(NameListGenerator.class); + private static Logger log = LoggerFactory.getLogger(NameListGenerator.class); private static String[][] FIELDS = 
{ {"lsid", "taxonID"}, diff --git a/src/main/java/au/org/ala/names/util/TermDump.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java similarity index 100% rename from src/main/java/au/org/ala/names/util/TermDump.java rename to ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java diff --git a/ala-name-matching-tools/src/main/resources/log4j.xml b/ala-name-matching-tools/src/main/resources/log4j.xml new file mode 100644 index 000000000..e0fbd4773 --- /dev/null +++ b/ala-name-matching-tools/src/main/resources/log4j.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/data/CB_script_AFD_synonyms.sql b/data/historical/CB_script_AFD_synonyms.sql similarity index 100% rename from data/CB_script_AFD_synonyms.sql rename to data/historical/CB_script_AFD_synonyms.sql diff --git a/data/historical/README.md b/data/historical/README.md new file mode 100644 index 000000000..b3d56a5bf --- /dev/null +++ b/data/historical/README.md @@ -0,0 +1 @@ +Historical files showing the history of data extraction and use. 
diff --git a/data/all-families.txt b/data/historical/all-families.txt similarity index 100% rename from data/all-families.txt rename to data/historical/all-families.txt diff --git a/src/main/resources/au/org/ala/propertystore/applicationContext-cb.xml b/data/historical/applicationContext-cb.xml similarity index 100% rename from src/main/resources/au/org/ala/propertystore/applicationContext-cb.xml rename to data/historical/applicationContext-cb.xml diff --git a/src/test/resources/au/org/ala/names/search/ba_names.txt b/data/historical/ba_names.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/ba_names.txt rename to data/historical/ba_names.txt diff --git a/src/test/resources/au/org/ala/names/search/bio_aust_birds.txt b/data/historical/bio_aust_birds.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/bio_aust_birds.txt rename to data/historical/bio_aust_birds.txt diff --git a/src/test/resources/au/org/ala/names/search/biocache_animal_col.txt b/data/historical/biocache_animal_col.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/biocache_animal_col.txt rename to data/historical/biocache_animal_col.txt diff --git a/src/test/resources/au/org/ala/names/search/birds.txt b/data/historical/birds.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/birds.txt rename to data/historical/birds.txt diff --git a/src/test/resources/au/org/ala/names/search/caab_fish.txt b/data/historical/caab_fish.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/caab_fish.txt rename to data/historical/caab_fish.txt diff --git a/src/main/resources/au/org/ala/db/CoL2010_dwc_export.sql b/data/historical/db/CoL2010_dwc_export.sql similarity index 98% rename from src/main/resources/au/org/ala/db/CoL2010_dwc_export.sql rename to data/historical/db/CoL2010_dwc_export.sql index 240309127..d3b475f22 100644 --- 
a/src/main/resources/au/org/ala/db/CoL2010_dwc_export.sql +++ b/data/historical/db/CoL2010_dwc_export.sql @@ -1,12 +1,12 @@ ---create the COL2010 DwC archive for use in Checklist Bank --- This script needs to group by record id to prevent mulitle entries occurring when a taxa's name has multiple entries in the scientific_name table ---? What is the reason for having mulitple names?? ---Query OK, 2424622 rows affected (23 min 52.22 sec) - -select t.record_id ,ifnull(t.lsid,''), ifnull(replace(replace(t.name, '\n', ' '), '\r',''),'') , if(t.parent_id>0, cast(t.parent_id as CHAR), '') , ifnull(t.taxon,'') , ifnull(cast(accepted.record_id as CHAR),'') , ifnull(replace(replace(accepted.name, '\n', ' '), '\r', ''), ''), ifnull(replace(replace(name.author,'\n',' '), '\r', ''),''), ifnull(replace(replace(name.infraspecies,'\n', ' '), '\r', ''), '') -INTO OUTFILE '/data/checklistbank/rawdata/col2010/DarwinCore.txt' character set UTF8 -from taxa t -LEFT JOIN scientific_names name on t.name_code = name.name_code -LEFT JOIN taxa accepted ON name.accepted_name_code = accepted.name_code and accepted.record_id <> t.record_id -group by t.record_id +--create the COL2010 DwC archive for use in Checklist Bank +-- This script needs to group by record id to prevent mulitle entries occurring when a taxa's name has multiple entries in the scientific_name table +--? What is the reason for having mulitple names?? 
+--Query OK, 2424622 rows affected (23 min 52.22 sec) + +select t.record_id ,ifnull(t.lsid,''), ifnull(replace(replace(t.name, '\n', ' '), '\r',''),'') , if(t.parent_id>0, cast(t.parent_id as CHAR), '') , ifnull(t.taxon,'') , ifnull(cast(accepted.record_id as CHAR),'') , ifnull(replace(replace(accepted.name, '\n', ' '), '\r', ''), ''), ifnull(replace(replace(name.author,'\n',' '), '\r', ''),''), ifnull(replace(replace(name.infraspecies,'\n', ' '), '\r', ''), '') +INTO OUTFILE '/data/checklistbank/rawdata/col2010/DarwinCore.txt' character set UTF8 +from taxa t +LEFT JOIN scientific_names name on t.name_code = name.name_code +LEFT JOIN taxa accepted ON name.accepted_name_code = accepted.name_code and accepted.record_id <> t.record_id +group by t.record_id order by t.record_id \ No newline at end of file diff --git a/src/main/resources/au/org/ala/db/CoL_commonNames.sql b/data/historical/db/CoL_commonNames.sql similarity index 98% rename from src/main/resources/au/org/ala/db/CoL_commonNames.sql rename to data/historical/db/CoL_commonNames.sql index ab89eb367..5e65a96ef 100644 --- a/src/main/resources/au/org/ala/db/CoL_commonNames.sql +++ b/data/historical/db/CoL_commonNames.sql @@ -1,10 +1,10 @@ ---export the CoL common names for use in the name matching API --- We are only interested in the names that are in English or have no assigned language --- Mark the Australian common names so that they be given a higher rating -SELECT cn.common_name, t.name, t.lsid,CASE WHEN cn.country='Australia' then 'T' ELSE '' END -INTO OUTFILE '/data/exports/col_common_names.txt' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' -FROM common_names cn -JOIN scientific_names sn ON cn.name_code = sn.name_code -JOIN taxa t ON sn.accepted_name_code = t.name_code -WHERE cn.language = 'English' or cn.language is null or cn.language='English;English' or cn.language ='' +--export the CoL common names for use in the name matching API +-- We are only interested in the 
names that are in English or have no assigned language +-- Mark the Australian common names so that they be given a higher rating +SELECT cn.common_name, t.name, t.lsid,CASE WHEN cn.country='Australia' then 'T' ELSE '' END +INTO OUTFILE '/data/exports/col_common_names.txt' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' +FROM common_names cn +JOIN scientific_names sn ON cn.name_code = sn.name_code +JOIN taxa t ON sn.accepted_name_code = t.name_code +WHERE cn.language = 'English' or cn.language is null or cn.language='English;English' or cn.language ='' GROUP BY cn.common_name, t.name,t.lsid,CASE WHEN cn.country='Australia' then 'T' ELSE '' END \ No newline at end of file diff --git a/src/main/resources/au/org/ala/db/checklist_bank_model_additions_ala.sql b/data/historical/db/checklist_bank_model_additions_ala.sql similarity index 98% rename from src/main/resources/au/org/ala/db/checklist_bank_model_additions_ala.sql rename to data/historical/db/checklist_bank_model_additions_ala.sql index 5578d88ff..ea10cfe45 100644 --- a/src/main/resources/au/org/ala/db/checklist_bank_model_additions_ala.sql +++ b/data/historical/db/checklist_bank_model_additions_ala.sql @@ -1,121 +1,121 @@ --- inserting some missing ranks so that all the exported values will have a valid (?) rank --- not 100% sure about some of the mappings. 
--- 575 infrageneric --- 725 subvariety --- 825 cultivar --- 875 unranked --- 900 supergenericname - -DELETE FROM term_gbif_portal_rank WHERE term_fk in (575, 725, 825, 875, 900); - -INSERT INTO term_gbif_portal_rank (term_fk, portal_rank) VALUES -(575, 6925), -(725,8015), -(825,8050), -(875, 0), -(900,8200) -; - --- Create the view that is necessary to export the taxon names in the format that ALA needs ---WHEN sci_pn.is_hybrid_formula = true THEN 1 (necessary for the old CB repository) ---WHEN sci.type = 5 THEN 1 (for the new) -CREATE OR REPLACE VIEW export_ala_taxon_name AS - SELECT COALESCE(can.id, sci.id) AS id, COALESCE(can.scientific_name, sci.scientific_name) AS canonical, - - CASE - WHEN tr.portal_rank < 6000 THEN sci_pn.monomial - ELSE NULL::character varying - END AS supra_generic, - CASE - WHEN tr.portal_rank >= 6000 THEN sci_pn.monomial - ELSE NULL::character varying - END AS generic, NULL::text AS infrageneric, sci_pn.specific_epithet, sci_pn.infra_specific_epithet AS infraspecific, NULL::text AS infraspecific_marker, - CASE - WHEN sci.type = 5 THEN 1 - ELSE 0 - END AS is_hybrid, tr.portal_rank AS rank, sci_pn.authorship AS author, NULL::unknown AS searchable_canonical - FROM name_usage nu - JOIN name_string sci ON nu.name_fk = sci.id - LEFT JOIN name_string can ON sci.canonical_name_fk = can.id - LEFT JOIN parsed_name sci_pn ON sci_pn.name_fk = sci.id - LEFT JOIN term_gbif_portal_rank tr ON nu.rank_fk = tr.term_fk - WHERE nu.checklist_fk = 1 - GROUP BY 1,2,3,4,5,6,7,8,9,10,11 - ORDER BY COALESCE(can.scientific_name, sci.scientific_name), tr.portal_rank, sci_pn.authorship; - ---create the view used for the taxon_concepts ---remove all the parent_fks and kingdom_fks that refer back to the "incertae sedis" record ie id=9. 
- -CREATE OR REPLACE VIEW ala_dwc_classification AS - SELECT u.id AS id, u.name_fk, n.id as sci_name_id,COALESCE(n.canonical_name_fk, n.id) as can_id,n.scientific_name, COALESCE(nc.scientific_name, n.scientific_name) AS canonical_name, u.lexical_group_fk, u.lft AS lft, u.rgt AS rgt, (COALESCE(np.authorship, ''::character varying)::text || - CASE - WHEN np.year IS NOT NULL THEN ', '::text || np.year::text - ELSE ''::text - END) || - CASE - WHEN np.authorship_basionym IS NOT NULL OR np.year_basionym IS NOT NULL THEN (' ('::text || COALESCE((np.authorship_basionym::text || ', '::text) || np.year_basionym::text, np.authorship_basionym::text, np.year_basionym::text)) || ')'::text - ELSE ''::text - END AS authorship, case u.parent_fk when 9 then null else u.parent_fk end, u.is_synonym, u.rank_fk, r.term as rank, case u.kingdom_fk when 9 then null else u.kingdom_fk end, knc.scientific_name AS kingdom, u.phylum_fk, COALESCE(pnc.scientific_name, pn.scientific_name) AS phylum, u.class_fk, COALESCE(cnc.scientific_name, cn.scientific_name) AS class, u.order_fk, COALESCE(onc.scientific_name, onn.scientific_name) AS "order", u.family_fk, COALESCE(fnc.scientific_name, fn.scientific_name) AS family, u.genus_fk, COALESCE(gnc.scientific_name, gn.scientific_name) AS genus, u.species_fk, COALESCE(snc.scientific_name, sn.scientific_name) AS species - FROM name_usage u - LEFT JOIN name_string n ON u.name_fk = n.id - LEFT JOIN name_string nc ON n.canonical_name_fk = nc.id - LEFT JOIN parsed_name np ON np.name_fk = n.id - LEFT JOIN term r ON u.rank_fk = r.id - LEFT JOIN name_usage ku ON u.kingdom_fk = ku.id - LEFT JOIN name_string kn ON ku.name_fk = kn.id - LEFT JOIN name_string knc ON kn.canonical_name_fk = knc.id - LEFT JOIN name_usage pu ON u.phylum_fk = pu.id - LEFT JOIN name_string pn ON pu.name_fk = pn.id - LEFT JOIN name_string pnc ON pn.canonical_name_fk = pnc.id - LEFT JOIN name_usage cu ON u.class_fk = cu.id - LEFT JOIN name_string cn ON cu.name_fk = cn.id - LEFT JOIN 
name_string cnc ON cn.canonical_name_fk = cnc.id - LEFT JOIN name_usage ou ON u.order_fk = ou.id - LEFT JOIN name_string onn ON ou.name_fk = onn.id - LEFT JOIN name_string onc ON onn.canonical_name_fk = onc.id - LEFT JOIN name_usage fu ON u.family_fk = fu.id - LEFT JOIN name_string fn ON fu.name_fk = fn.id - LEFT JOIN name_string fnc ON fn.canonical_name_fk = fnc.id - LEFT JOIN name_usage gu ON u.genus_fk = gu.id - LEFT JOIN name_string gn ON gu.name_fk = gn.id - LEFT JOIN name_string gnc ON gn.canonical_name_fk = gnc.id - LEFT JOIN name_usage su ON u.species_fk = su.id - LEFT JOIN name_string sn ON su.name_fk = sn.id - LEFT JOIN name_string snc ON sn.canonical_name_fk = snc.id - WHERE u.checklist_fk = 1; - ---may need to materialise the view so that SELECT statements are performant Query returned successfully with no result in 3842271 ms. -drop table IF EXISTS tmp_export_name_usage; - -create table tmp_export_name_usage AS SELECT * from ala_dwc_classification; -CREATE INDEX tmp_export_name_id_idx - ON tmp_export_name_usage - USING btree - (id) - WITH (FILLFACTOR=90); - ---create a tmp table with index on lookup columns to improve the performance of the lsid identifier lookup -drop table IF EXISTS tmp_identifiers; - -create table tmp_identifiers( -id serial NOT NULL, -lexical_group_fk integer, -name_fk integer, -identifier character varying(500), -checklist_fk integer -); -CREATE INDEX idx_tmp_ids_lg - ON tmp_identifiers - USING btree - (lexical_group_fk, name_fk); - ---insert the lsid type identifiers into the temporary identifiers table. 2636708 rows affected, 1107075 ms ---2622695 rows affected, 1129295 ms ---Query returned successfully: 2560234 rows affected, 1781306 ms execution time. 
--- NC: Added a order by identifier so that the consistent LSIDs are reported when multiple LSIDs exist for one taxon -INSERT into tmp_identifiers (lexical_group_fk, name_fk, identifier,checklist_fk) -SELECT nu.lexical_group_fk, COALESCE(ns.canonical_name_fk, ns.id), i.identifier, nu.checklist_fk FROM identifier i JOIN name_usage nu ON i.usage_fk = nu.id JOIN name_string ns on nu.name_fk = ns.id where i.type_fk = 2001 ORDER BY CASE nu.checklist_fk WHEN 1001 THEN 1 WHEN 1002 THEN 2 WHEN 1003 THEN 3 ELSE 4 END, i.identifier; - ---The SQL below identifies potential lexical groups that will have issues when the nub is genertaed ---The is specific to when 2 different ranks belong to the same lexical group eg Plecoptera is an ORDER and GENUS +-- inserting some missing ranks so that all the exported values will have a valid (?) rank +-- not 100% sure about some of the mappings. +-- 575 infrageneric +-- 725 subvariety +-- 825 cultivar +-- 875 unranked +-- 900 supergenericname + +DELETE FROM term_gbif_portal_rank WHERE term_fk in (575, 725, 825, 875, 900); + +INSERT INTO term_gbif_portal_rank (term_fk, portal_rank) VALUES +(575, 6925), +(725,8015), +(825,8050), +(875, 0), +(900,8200) +; + +-- Create the view that is necessary to export the taxon names in the format that ALA needs +--WHEN sci_pn.is_hybrid_formula = true THEN 1 (necessary for the old CB repository) +--WHEN sci.type = 5 THEN 1 (for the new) +CREATE OR REPLACE VIEW export_ala_taxon_name AS + SELECT COALESCE(can.id, sci.id) AS id, COALESCE(can.scientific_name, sci.scientific_name) AS canonical, + + CASE + WHEN tr.portal_rank < 6000 THEN sci_pn.monomial + ELSE NULL::character varying + END AS supra_generic, + CASE + WHEN tr.portal_rank >= 6000 THEN sci_pn.monomial + ELSE NULL::character varying + END AS generic, NULL::text AS infrageneric, sci_pn.specific_epithet, sci_pn.infra_specific_epithet AS infraspecific, NULL::text AS infraspecific_marker, + CASE + WHEN sci.type = 5 THEN 1 + ELSE 0 + END AS is_hybrid, 
tr.portal_rank AS rank, sci_pn.authorship AS author, NULL::unknown AS searchable_canonical + FROM name_usage nu + JOIN name_string sci ON nu.name_fk = sci.id + LEFT JOIN name_string can ON sci.canonical_name_fk = can.id + LEFT JOIN parsed_name sci_pn ON sci_pn.name_fk = sci.id + LEFT JOIN term_gbif_portal_rank tr ON nu.rank_fk = tr.term_fk + WHERE nu.checklist_fk = 1 + GROUP BY 1,2,3,4,5,6,7,8,9,10,11 + ORDER BY COALESCE(can.scientific_name, sci.scientific_name), tr.portal_rank, sci_pn.authorship; + +--create the view used for the taxon_concepts +--remove all the parent_fks and kingdom_fks that refer back to the "incertae sedis" record ie id=9. + +CREATE OR REPLACE VIEW ala_dwc_classification AS + SELECT u.id AS id, u.name_fk, n.id as sci_name_id,COALESCE(n.canonical_name_fk, n.id) as can_id,n.scientific_name, COALESCE(nc.scientific_name, n.scientific_name) AS canonical_name, u.lexical_group_fk, u.lft AS lft, u.rgt AS rgt, (COALESCE(np.authorship, ''::character varying)::text || + CASE + WHEN np.year IS NOT NULL THEN ', '::text || np.year::text + ELSE ''::text + END) || + CASE + WHEN np.authorship_basionym IS NOT NULL OR np.year_basionym IS NOT NULL THEN (' ('::text || COALESCE((np.authorship_basionym::text || ', '::text) || np.year_basionym::text, np.authorship_basionym::text, np.year_basionym::text)) || ')'::text + ELSE ''::text + END AS authorship, case u.parent_fk when 9 then null else u.parent_fk end, u.is_synonym, u.rank_fk, r.term as rank, case u.kingdom_fk when 9 then null else u.kingdom_fk end, knc.scientific_name AS kingdom, u.phylum_fk, COALESCE(pnc.scientific_name, pn.scientific_name) AS phylum, u.class_fk, COALESCE(cnc.scientific_name, cn.scientific_name) AS class, u.order_fk, COALESCE(onc.scientific_name, onn.scientific_name) AS "order", u.family_fk, COALESCE(fnc.scientific_name, fn.scientific_name) AS family, u.genus_fk, COALESCE(gnc.scientific_name, gn.scientific_name) AS genus, u.species_fk, COALESCE(snc.scientific_name, sn.scientific_name) AS 
species + FROM name_usage u + LEFT JOIN name_string n ON u.name_fk = n.id + LEFT JOIN name_string nc ON n.canonical_name_fk = nc.id + LEFT JOIN parsed_name np ON np.name_fk = n.id + LEFT JOIN term r ON u.rank_fk = r.id + LEFT JOIN name_usage ku ON u.kingdom_fk = ku.id + LEFT JOIN name_string kn ON ku.name_fk = kn.id + LEFT JOIN name_string knc ON kn.canonical_name_fk = knc.id + LEFT JOIN name_usage pu ON u.phylum_fk = pu.id + LEFT JOIN name_string pn ON pu.name_fk = pn.id + LEFT JOIN name_string pnc ON pn.canonical_name_fk = pnc.id + LEFT JOIN name_usage cu ON u.class_fk = cu.id + LEFT JOIN name_string cn ON cu.name_fk = cn.id + LEFT JOIN name_string cnc ON cn.canonical_name_fk = cnc.id + LEFT JOIN name_usage ou ON u.order_fk = ou.id + LEFT JOIN name_string onn ON ou.name_fk = onn.id + LEFT JOIN name_string onc ON onn.canonical_name_fk = onc.id + LEFT JOIN name_usage fu ON u.family_fk = fu.id + LEFT JOIN name_string fn ON fu.name_fk = fn.id + LEFT JOIN name_string fnc ON fn.canonical_name_fk = fnc.id + LEFT JOIN name_usage gu ON u.genus_fk = gu.id + LEFT JOIN name_string gn ON gu.name_fk = gn.id + LEFT JOIN name_string gnc ON gn.canonical_name_fk = gnc.id + LEFT JOIN name_usage su ON u.species_fk = su.id + LEFT JOIN name_string sn ON su.name_fk = sn.id + LEFT JOIN name_string snc ON sn.canonical_name_fk = snc.id + WHERE u.checklist_fk = 1; + +--may need to materialise the view so that SELECT statements are performant Query returned successfully with no result in 3842271 ms. 
+drop table IF EXISTS tmp_export_name_usage; + +create table tmp_export_name_usage AS SELECT * from ala_dwc_classification; +CREATE INDEX tmp_export_name_id_idx + ON tmp_export_name_usage + USING btree + (id) + WITH (FILLFACTOR=90); + +--create a tmp table with index on lookup columns to improve the performance of the lsid identifier lookup +drop table IF EXISTS tmp_identifiers; + +create table tmp_identifiers( +id serial NOT NULL, +lexical_group_fk integer, +name_fk integer, +identifier character varying(500), +checklist_fk integer +); +CREATE INDEX idx_tmp_ids_lg + ON tmp_identifiers + USING btree + (lexical_group_fk, name_fk); + +--insert the lsid type identifiers into the temporary identifiers table. 2636708 rows affected, 1107075 ms +--2622695 rows affected, 1129295 ms +--Query returned successfully: 2560234 rows affected, 1781306 ms execution time. +-- NC: Added a order by identifier so that the consistent LSIDs are reported when multiple LSIDs exist for one taxon +INSERT into tmp_identifiers (lexical_group_fk, name_fk, identifier,checklist_fk) +SELECT nu.lexical_group_fk, COALESCE(ns.canonical_name_fk, ns.id), i.identifier, nu.checklist_fk FROM identifier i JOIN name_usage nu ON i.usage_fk = nu.id JOIN name_string ns on nu.name_fk = ns.id where i.type_fk = 2001 ORDER BY CASE nu.checklist_fk WHEN 1001 THEN 1 WHEN 1002 THEN 2 WHEN 1003 THEN 3 ELSE 4 END, i.identifier; + +--The SQL below identifies potential lexical groups that will have issues when the nub is genertaed +--The is specific to when 2 different ranks belong to the same lexical group eg Plecoptera is an ORDER and GENUS -- select lg.id, count(distinct preferred_term_fk) from lexical_group lg join name_usage nu on lg.id = nu.lexical_group_fk join term r on nu.rank_fk=r.id group by lg.id having count(distinct preferred_term_fk)>1 \ No newline at end of file diff --git a/src/main/resources/au/org/ala/db/irmng.sql b/data/historical/db/irmng.sql similarity index 98% rename from 
src/main/resources/au/org/ala/db/irmng.sql rename to data/historical/db/irmng.sql index db5cef0cd..d7af6c34a 100644 --- a/src/main/resources/au/org/ala/db/irmng.sql +++ b/data/historical/db/irmng.sql @@ -1,17 +1,17 @@ ---create the list of known homonyms. ---The list assumes that a genus name that appears more than once in genus table is a homonym ---It is better to use this statement to create a list of known homonyms just in case Tony has not updated the DUPLICATE_FLAG field. -select GENUS into outfile '/tmp/known_homonyms.txt' from MASTER_GENLIST GROUP BY GENUS having count(GENUS)>1; - ---create the classification for all the Genus in IRMNG ---This will be used when trying to verify that a synonym has the correct higher classification -select case when UPPER(mf.KINGDOM) like '%UNALLOCATED%' then '' else mf.KINGDOM end, -case when UPPER(mf.PHYLUM) like '%UNALLOCATED%' then '' else mf.PHYLUM end, -case when UPPER(mf.CLASS) like '%UNALLOCATED%' then '' else mf.CLASS end , -case when UPPER(mf.ORDERNAME) like '%UNALLOCATED%' then '' else mf.ORDERNAME end, -case when UPPER(mf.FAMILY) like '%UNALLOCATED%' then '' else mf.FAMILY end, -mg.GENUS, mg.GENUS_ID, IFNULL(mg.SYNONYM_FLAG, ''), IFNULL(mg.IS_SYN_OF_CODE, ''), -IFNULL(mg.IS_SYN_OF_NAME, ''),IFNULL( mg.DUPLICATE_FLAG,'') -into outfile '/tmp/irmng_classification.txt' -from MASTER_GENLIST mg JOIN MASTER_FAMLIST mf on mg.FAMILY_ID = mf.FAMILY_ID - +--create the list of known homonyms. +--The list assumes that a genus name that appears more than once in genus table is a homonym +--It is better to use this statement to create a list of known homonyms just in case Tony has not updated the DUPLICATE_FLAG field. 
+select GENUS into outfile '/tmp/known_homonyms.txt' from MASTER_GENLIST GROUP BY GENUS having count(GENUS)>1; + +--create the classification for all the Genus in IRMNG +--This will be used when trying to verify that a synonym has the correct higher classification +select case when UPPER(mf.KINGDOM) like '%UNALLOCATED%' then '' else mf.KINGDOM end, +case when UPPER(mf.PHYLUM) like '%UNALLOCATED%' then '' else mf.PHYLUM end, +case when UPPER(mf.CLASS) like '%UNALLOCATED%' then '' else mf.CLASS end , +case when UPPER(mf.ORDERNAME) like '%UNALLOCATED%' then '' else mf.ORDERNAME end, +case when UPPER(mf.FAMILY) like '%UNALLOCATED%' then '' else mf.FAMILY end, +mg.GENUS, mg.GENUS_ID, IFNULL(mg.SYNONYM_FLAG, ''), IFNULL(mg.IS_SYN_OF_CODE, ''), +IFNULL(mg.IS_SYN_OF_NAME, ''),IFNULL( mg.DUPLICATE_FLAG,'') +into outfile '/tmp/irmng_classification.txt' +from MASTER_GENLIST mg JOIN MASTER_FAMLIST mf on mg.FAMILY_ID = mf.FAMILY_ID + diff --git a/src/main/resources/au/org/ala/vocab/nomenclatural_status.properties b/data/historical/nomenclatural_status.properties similarity index 100% rename from src/main/resources/au/org/ala/vocab/nomenclatural_status.properties rename to data/historical/nomenclatural_status.properties diff --git a/src/test/resources/au/org/ala/names/search/spatial-distribution-names.txt b/data/historical/spatial-distribution-names.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/spatial-distribution-names.txt rename to data/historical/spatial-distribution-names.txt diff --git a/pom.xml b/pom.xml index 66835aac5..52dcc94d2 100644 --- a/pom.xml +++ b/pom.xml @@ -9,17 +9,27 @@ 4.0.0 au.org.ala ala-name-matching - jar + pom 4.0-SNAPSHOT + + + ala-name-matching-model + ala-name-matching-search + ala-name-matching-builder + ala-name-matching-tools + ala-name-matching-distribution + + ALA Name Matching (for Lucene 8 or above) - + scm:git:git@github.com:AtlasOfLivingAustralia/ala-name-matching.git 
https://github.com/AtlasOfLivingAustralia/ala-name-matching scm:git:git@github.com:AtlasOfLivingAustralia/ala-name-matching.git HEAD + UTF-8 @@ -28,122 +38,29 @@ 1.8 java18 1.0 + 4.13.1 + 1.7.25 + 1.6.2 + 4.1 + 1.2 + 2.4 + 1.32 + 2.61 - - org.gbif - dwca-io - 1.27 - - - commons-io - commons-io - - - org.slf4j - slf4j-api - - - - - - org.slf4j - slf4j-log4j12 - 1.7.25 - - - org.slf4j - slf4j-api - 1.7.25 - - - commons-collections - commons-collections - 3.2.2 - - - - org.gbif - gbif-common - 0.37 - - - org.gbif - name-parser - 2.24 - - - org.gbif.checklistbank - checklistbank-common - 2.61 - - - org.gbif - dwca-io - 1.32 - - junit junit - 4.13.1 + ${junit.version} test - - - org.apache.lucene - lucene-core - ${org.apache.lucene.version} - - - org.apache.lucene - lucene-backward-codecs - ${org.apache.lucene.version} - - - org.apache.lucene - lucene-analyzers-common - ${org.apache.lucene.version} - - - org.apache.lucene - lucene-queryparser - ${org.apache.lucene.version} - - - commons-lang - commons-lang - 2.6 - - - xerces - xercesImpl - - - - - com.opencsv - opencsv - 4.1 - jar - + - uk.ac.shef.wit - simmetrics - 1.6.2 + org.slf4j + slf4j-api + ${slf4j.version} - + com.fasterxml.jackson.core jackson-core @@ -159,12 +76,6 @@ jackson-annotations ${com.fasterxml.jackson.version} - - - commons-cli - commons-cli - 1.2 - @@ -181,29 +92,6 @@ UTF8 - - org.apache.maven.plugins - maven-assembly-plugin - - - src/assembly/assembly.xml - - - - true - lib/ - - - - - - package - - single - - - - org.apache.maven.plugins maven-source-plugin @@ -273,7 +161,7 @@ maven-assembly-plugin - src/assembly/assembly.xml + old/assembly/assembly.xml diff --git a/src/assembly/assembly.xml b/src/assembly/assembly.xml deleted file mode 100644 index 25a1e4f4c..000000000 --- a/src/assembly/assembly.xml +++ /dev/null @@ -1,30 +0,0 @@ - - distribution - - zip - - false - - - true - true - false - runtime - 0755 - 0755 - - au.org.ala:ala-name-matching - - - - false - true - false - runtime - 0644 - 0755 
- lib - - - - \ No newline at end of file diff --git a/src/main/java/au/org/ala/vocab/Concept.java b/src/main/java/au/org/ala/vocab/Concept.java deleted file mode 100644 index 705acfb4d..000000000 --- a/src/main/java/au/org/ala/vocab/Concept.java +++ /dev/null @@ -1,143 +0,0 @@ -package au.org.ala.vocab; - -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; - -import java.io.IOException; -import java.io.Writer; -import java.net.URI; -import java.net.URISyntaxException; -import java.util.Arrays; -import java.util.List; - -/** - * Abstract vocabulary concept. - *

- * These are modelled as data rather than enums or the like so that ... ahem ... unique source - * vocabularies can be mapped. - *

- * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -@JsonIdentityInfo(generator = ObjectIdGenerators.PropertyGenerator.class, property = "id") -@JsonTypeInfo(use=JsonTypeInfo.Id.CLASS, include=JsonTypeInfo.As.PROPERTY, property="@class") -@JsonInclude(JsonInclude.Include.NON_NULL) -abstract public class Concept> { - /** The concept URI */ - @JsonProperty - private URI uri; - /** The concept id; a unique identifier */ - @JsonProperty - private String id; - /** Alternative names for a concept */ - @JsonProperty - private List names; - /** The concept description */ - @JsonProperty - private String description; - /** The concept vocabulary that this concept is a member of */ - @JsonManagedReference - private Vocabulary vocabulary; - /** A parent concept */ - @JsonProperty - private Concept parent; - - public Concept() { - } - - public Concept(Vocabulary vocabulary, URI uri, String id, String description, Concept parent, String... names) { - this.vocabulary = vocabulary; - this.uri = uri; - this.id = id; - this.names = names == null ? null : Arrays.asList(names); - this.description = description; - this.parent = parent; - } - - public Concept(Vocabulary vocabulary, String id, String description, Concept parent, String... names) { - this(vocabulary, null, id, description, parent, names); - try { - this.uri = new URI(this.vocabulary.getUri().getScheme(), this.vocabulary.getUri().getSchemeSpecificPart(), id); - } catch (URISyntaxException ex) { - throw new IllegalArgumentException("Unable to construct concept " + id, ex); - } - } - - public Concept(Vocabulary vocabulary, String id, String... names) { - this(vocabulary, id, null, null, names); - } - - - /** - * Get the URI associated with this concept. - * - * @return The concept URI - */ - public URI getUri() { - return uri; - } - - /** - * Get the id of the concept. - *

- * The id is a unique identifier for this concept. - *

- * - * @return The concept id - */ - public String getId() { - return id; - } - - /** - * Get the list of alternative names for a concept. - * - * @return The alternative name list - */ - public List getNames() { - return names; - } - - /** - * Get the long description of the concept. - * - * @return The long description - */ - public String getDescription() { - return description; - } - - - /** - * Get the vocabulary that the concept is part of - * - * @return The vocabulary - */ - public Vocabulary getVocabulary() { - return vocabulary; - } - - /** - * Get the parent concept. - * - * @return A wider or more general version of the concept. - */ - public Concept getParent() { - return parent; - } - - /** - * Write the concept to a writer - * - * @param writer - * @throws IOException - */ - public void write(Writer writer) throws IOException { - ObjectMapper mapper = new ObjectMapper(); - - mapper.enable(SerializationFeature.INDENT_OUTPUT); - mapper.writeValue(writer, this); - } -} diff --git a/src/main/java/au/org/ala/vocab/TaxonRank.java b/src/main/java/au/org/ala/vocab/TaxonRank.java deleted file mode 100644 index e03e5c9b5..000000000 --- a/src/main/java/au/org/ala/vocab/TaxonRank.java +++ /dev/null @@ -1,63 +0,0 @@ -package au.org.ala.vocab; - -/** - * A taxonomic rank - * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -public class TaxonRank extends Concept { - /** The rank level */ - private int level; - /** Is this rank comparable */ - private boolean comparable; - /** Is this one of the standard linnaean ranks? */ - private boolean linnaean; - - public TaxonRank() { - } - - public TaxonRank(Vocabulary vocabulary, String id, int level, boolean comparable, boolean linnaean, String... names) { - super(vocabulary, id, names); - this.level = level; - this.comparable = comparable; - this.linnaean = linnaean; - } - - /** - * The rank level. - *

- * Larger indicates a lower order (more specific) taxon - *

- * - * @return The level - */ - public int getLevel() { - return level; - } - - /** - * Is this a comparable rank? - *

- * Comparable ranks should have taxa with parent-child in order. - * Non-comparable ranks indicate a - *

- * - * @return True if the rank is comparable - */ - public boolean isComparable() { - return comparable; - } - - /** - * Is this a Linnaean rank? - *

- * One of the big seven (kingdom, phylum, class, order, familty, genus, species) - *

- * - * @return True if a linnaean rank - */ - public boolean isLinnaean() { - return linnaean; - } -} diff --git a/src/main/java/au/org/ala/vocab/Vocabulary.java b/src/main/java/au/org/ala/vocab/Vocabulary.java deleted file mode 100644 index aa8fdc87e..000000000 --- a/src/main/java/au/org/ala/vocab/Vocabulary.java +++ /dev/null @@ -1,98 +0,0 @@ -package au.org.ala.vocab; - -import com.fasterxml.jackson.annotation.JsonBackReference; -import com.fasterxml.jackson.databind.annotation.JsonDeserialize; -import com.fasterxml.jackson.databind.util.StdConverter; - -import java.net.URI; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * A vocabulary constructed from - * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -@JsonDeserialize(converter = Vocabulary.VocabularyConverter.class) -public class Vocabulary> extends Concept> { - /** The concepts */ - @JsonBackReference - private List> concepts; - /** The vocabulary concepts */ - private Map> uriConceptMap; - /** The vocabulary names */ - private Map> nameConceptMap; - - public Vocabulary() { - this.concepts = new ArrayList<>(); - this.uriConceptMap = new HashMap<>(); - this.nameConceptMap = new HashMap<>(); - } - - public Vocabulary(URI uri, String id, String description) { - super(null, uri, id, description, null, null); - this.concepts = new ArrayList<>(); - this.uriConceptMap = new HashMap<>(); - this.nameConceptMap = new HashMap<>(); - } - - /** - * Add a concept to the vocabulary - * - * @param concept The concept - */ - public void add(Concept concept) { - this.concepts.add(concept); - this.resolve(concept); - } - - /** - * Build vocabulary maps to allow get by name/get by URI - */ - protected void resolve() { - this.uriConceptMap = new HashMap<>(this.concepts.size()); - this.nameConceptMap = new HashMap<>(this.concepts.size()); - for (Concept concept: this.concepts) - this.resolve(concept); - 
} - - /** - * Add a concept to the lookup tables - * - * @param concept The concept to add - * - * @throws IllegalStateException if the concept URI or name has already been added - */ - protected void resolve(Concept concept) { - if (concept.getUri() != null) { - if (this.uriConceptMap.containsKey(concept.getUri())) - throw new IllegalStateException("Duplicate uri " + concept.getUri() + " for " + concept.getId()); - this.uriConceptMap.put(concept.getUri(), concept); - } - if (this.nameConceptMap.containsKey(concept.getId())) - throw new IllegalStateException("Duplicate id " + concept.getId()); - this.nameConceptMap.put(concept.getId(), concept); - if (concept.getNames() != null) { - for (String name: concept.getNames()) { - if (this.nameConceptMap.containsKey(concept.getId())) - throw new IllegalStateException("Duplicate name " + name + " for " + concept.getId()); - this.nameConceptMap.put(name, concept); - } - } - } - - /** - * Converter to allow post-construction concept maps to be built - */ - protected static class VocabularyConverter extends StdConverter, Vocabulary> { - @Override - public Vocabulary convert(Vocabulary value) { - value.resolve(); - return value; - } - } - -} diff --git a/src/test/java/au/org/ala/names/search/MatchMetricsTest.java b/src/test/java/au/org/ala/names/search/MatchMetricsTest.java deleted file mode 100644 index 20ea83f95..000000000 --- a/src/test/java/au/org/ala/names/search/MatchMetricsTest.java +++ /dev/null @@ -1,422 +0,0 @@ -package au.org.ala.names.search; - -import au.org.ala.names.model.LinnaeanRankClassification; -import au.org.ala.names.model.MatchMetrics; -import org.junit.Before; -import org.junit.Test; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -/** - * Tests for the rank classification - */ -public class MatchMetricsTest { - private static final float MATCH_TOLERANCE = 0.01f; - private static final LinnaeanRankClassification CLASS1 = new 
LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptra", "Formicidae", "Huberria", "Huberia striata", "(Smith, 1876)"); - private static final LinnaeanRankClassification CLASS2 = new LinnaeanRankClassification("Charophyta", "Arthropoda", "Equisetopsida", "Gentianales", "Apocynaceae", "Oxypetalum", "Oxypetalum caeruleum", "(D.Don) Decne."); - private MatchMetrics metrics; - - @Before - public void setup() { - this.metrics = new MatchMetrics(); - } - - @Test - public void testComputeMatch1() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch2() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch3() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setPhylum(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch4() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKlass(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch5() throws Exception { - LinnaeanRankClassification query = new 
LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setOrder(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch6() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setFamily(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch7() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setGenus(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch8() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setScientificName(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch9() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setAuthorship(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch10() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setKingdom(null); - this.metrics.computeMatch(query, result, false); - 
assertEquals(0.746, metrics.getMatch(), MATCH_TOLERANCE); - } - - - @Test - public void testComputeMatch11() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setKingdom("Plantae"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.816, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch12() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum(null); - this.metrics.computeMatch(query, result, false); - assertEquals(0.976, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch13() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum("Chordata"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.958, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch14() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum("ARTHROPODA"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch15() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setKlass("Hexapodia"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.947, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch16() throws Exception { - LinnaeanRankClassification query = new 
LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setFamily("PERIPATOPSIDAE"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.942, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch17() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setGenus("Vescerro"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.929, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch18() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setSpecificEpithet("striata"); - result.setSpecificEpithet("trigona"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.975, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch19() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setAuthorship("Smith, 1876"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch20() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setAuthorship("Smith"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch21() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); 
- result.setAuthorship("Jones, 1876"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.854, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch22() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setAuthorship("Jones"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.756, metrics.getMatch(), MATCH_TOLERANCE); - } - - /** - * Testing bad authors without much context, should result in a lowered match. - * - * @throws Exception - */ - @Test - public void testComputeMatch23() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - query.setFamily(null); - query.setGenus(null); - result.setAuthorship("Smith, 1876"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch24() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - query.setFamily(null); - query.setGenus(null); - result.setAuthorship("Smith"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch25() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - 
query.setFamily(null); - query.setGenus(null); - result.setAuthorship("Jones, 1876"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.554, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch26() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - query.setFamily(null); - query.setGenus(null); - result.setAuthorship("Jones"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.255, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatchSynonym1() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS2); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS2); - result.setKingdom(null); - result.setPhylum(null); - result.setKlass(null); - result.setOrder(null); - result.setFamily(null); - result.setGenus(null); - this.metrics.computeMatch(query, result, true); - assertEquals(0.655, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatchSynonym2() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS2); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS2); - result.setKingdom(null); - result.setPhylum(null); - result.setKlass(null); - result.setOrder(null); - result.setFamily(null); - this.metrics.computeMatch(query, result, true); - assertEquals(0.675, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatchSynonym3() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS2); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS2); - result.setKingdom(null); - result.setPhylum(null); - result.setKlass(null); - 
result.setOrder(null); - result.setFamily(null); - result.setAuthorship(null); - this.metrics.computeMatch(query, result, true); - assertEquals(0.482, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatchSynonym4() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS2); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS2); - result.setKingdom(null); - result.setPhylum(null); - result.setKlass(null); - result.setOrder(null); - result.setFamily(null); - query.setAuthorship(null); - result.setAuthorship(null); - this.metrics.computeMatch(query, result, true); - assertEquals(0.471, metrics.getMatch(), MATCH_TOLERANCE); - } - - - /** - * Test match computation takes less than 1us per match for simple cases - * - * @throws Exception - */ - @Test - public void testComputeMatchTiming1() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - long start = System.currentTimeMillis(); - for (int i = 0; i < 1000000; i++) - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - long time = System.currentTimeMillis() - start; - assertTrue("Took " + time + "ms. 
Required to be less than 4000ms", time < 4000); - } - - /** - * Test match computation takes less than 1us per match for simple cases - * - * @throws Exception - */ - @Test - public void testComputeMatchTiming2() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - long start = System.currentTimeMillis(); - for (int i = 0; i < 1000000; i++) - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - long time = System.currentTimeMillis() - start; - assertTrue("Took " + time + "ms. Required to be less than 4000ms", time < 4000); - } - - /** - * Test match computation takes less than 10us per match for one bodgy result - * - * @throws Exception - */ - @Test - public void testComputeMatchTiming3() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum("Chordata"); - long start = System.currentTimeMillis(); - for (int i = 0; i < 100000; i++) - this.metrics.computeMatch(query, result, false); - assertEquals(0.958, metrics.getMatch(), MATCH_TOLERANCE); - long time = System.currentTimeMillis() - start; - assertTrue("Took " + time + "ms. 
Required to be less than 4000ms", time < 4000); - } - - /** - * Test match computation takes less than 10us per match for two bodgy results - * - * @throws Exception - */ - @Test - public void testComputeMatchTiming4() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum("Chordata"); - result.setGenus("Acacia"); - long start = System.currentTimeMillis(); - for (int i = 0; i < 100000; i++) - this.metrics.computeMatch(query, result, false); - assertEquals(0.873, metrics.getMatch(), MATCH_TOLERANCE); - long time = System.currentTimeMillis() - start; - assertTrue("Took " + time + "ms. Required to be less than 4000ms", time < 4000); - } - - -} diff --git a/src/test/java/au/org/ala/vocab/TaxonRankTest.java b/src/test/java/au/org/ala/vocab/TaxonRankTest.java deleted file mode 100644 index 4696cef1c..000000000 --- a/src/test/java/au/org/ala/vocab/TaxonRankTest.java +++ /dev/null @@ -1,33 +0,0 @@ -package au.org.ala.vocab; - -import au.org.ala.names.util.TestUtils; -import org.junit.Before; -import org.junit.Test; - -import java.io.StringWriter; -import java.net.URI; - -import static org.junit.Assert.assertEquals; - -/** - * Tests for a taxonomic rank. 
- * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -public class TaxonRankTest extends TestUtils { - private Vocabulary vocabulary; - - @Before - public void setup() throws Exception { - this.vocabulary = new Vocabulary<>(URI.create("urm:x-ala:vocabulary:tr-1"), "tr-1", null); - } - - @Test - public void testWrite1() throws Exception { - TaxonRank rank = new TaxonRank(this.vocabulary, "genus", 6000, true, true); - StringWriter sw = new StringWriter(); - rank.write(sw); - assertEquals(this.loadResource("taxon-rank-1.json"), sw.toString()); - } -} diff --git a/src/test/resources/au/org/ala/vocab/taxon-rank-1.json b/src/test/resources/au/org/ala/vocab/taxon-rank-1.json deleted file mode 100644 index 515dc85f0..000000000 --- a/src/test/resources/au/org/ala/vocab/taxon-rank-1.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "@class" : "au.org.ala.vocab.TaxonRank", - "id" : "genus", - "uri" : "urm:x-ala:vocabulary:tr-1#genus", - "names" : [ ], - "vocabulary" : { - "@class" : "au.org.ala.vocab.Vocabulary", - "id" : "tr-1", - "uri" : "urm:x-ala:vocabulary:tr-1" - }, - "level" : 6000, - "comparable" : true, - "linnaean" : true -} \ No newline at end of file From 8f04ffabe84a3d66856654c7c089eafd3f77127a Mon Sep 17 00:00:00 2001 From: pal155 Date: Tue, 5 Oct 2021 09:59:25 +1100 Subject: [PATCH 12/19] Update POM to remove travis assembly --- pom.xml | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) diff --git a/pom.xml b/pom.xml index 52dcc94d2..7a031300c 100644 --- a/pom.xml +++ b/pom.xml @@ -121,19 +121,6 @@ - - org.apache.maven.plugins - maven-jar-plugin - - - - true - lib/ - au.org.ala.names.search.DwcaNameIndexer - - - - @@ -142,43 +129,6 @@ travis - - org.apache.maven.plugins - maven-jar-plugin - 2.4 - - - - true - lib/ - au.org.ala.names.search.DwcaNameIndexer - - - - - - org.apache.maven.plugins - maven-assembly-plugin - - - old/assembly/assembly.xml - - - - true - lib/ - - - - 
- - package - - single - - - - From d0739763048fffc24cf371b84d9fd9c9d74ad356 Mon Sep 17 00:00:00 2001 From: pal155 Date: Wed, 6 Oct 2021 09:56:24 +1100 Subject: [PATCH 13/19] Add copyright notices --- LICENSE | 469 ++++++++++++++++++ .../org/ala/names/index/ALANameAnalyser.java | 16 + .../org/ala/names/index/ALATaxonResolver.java | 16 + .../java/au/org/ala/names/index/BareName.java | 16 + .../au/org/ala/names/index/CSVNameSource.java | 16 + .../org/ala/names/index/DwcaNameSource.java | 16 + .../names/index/IndexBuilderException.java | 16 + .../au/org/ala/names/index/IssueType.java | 16 + .../java/au/org/ala/names/index/Name.java | 16 + .../au/org/ala/names/index/NameAnalyser.java | 16 + .../java/au/org/ala/names/index/NameKey.java | 16 + .../au/org/ala/names/index/NameProvider.java | 16 + .../au/org/ala/names/index/NameSource.java | 16 + .../org/ala/names/index/RankComparator.java | 16 + .../java/au/org/ala/names/index/Reporter.java | 16 + .../ala/names/index/ResolutionException.java | 16 + .../org/ala/names/index/ScientificName.java | 16 + .../au/org/ala/names/index/TaxonConcept.java | 16 + .../ala/names/index/TaxonConceptInstance.java | 16 + .../org/ala/names/index/TaxonResolution.java | 16 + .../names/index/TaxonResolutionException.java | 16 + .../au/org/ala/names/index/TaxonResolver.java | 16 + .../org/ala/names/index/TaxonomicElement.java | 16 + .../java/au/org/ala/names/index/Taxonomy.java | 16 + .../org/ala/names/index/TaxonomyBuilder.java | 16 + .../names/index/TaxonomyConfiguration.java | 16 + .../names/index/UnrankedScientificName.java | 16 + .../index/provider/AndTaxonCondition.java | 16 + .../provider/ConceptResolutionPriority.java | 16 + .../names/index/provider/DiscardStrategy.java | 16 + .../ala/names/index/provider/KeyAdjuster.java | 16 + .../names/index/provider/KeyAdjustment.java | 16 + .../index/provider/MatchTaxonCondition.java | 16 + .../names/index/provider/NameMatchType.java | 16 + .../index/provider/OrTaxonCondition.java | 16 + 
.../names/index/provider/ScoreAdjuster.java | 16 + .../names/index/provider/ScoreAdjustment.java | 16 + .../names/index/provider/TaxonCondition.java | 16 + .../index/provider/UnrankedStrategy.java | 16 + .../au/org/ala/names/util/DwcaWriter.java | 16 + .../au/org/ala/names/util/GbifModule.java | 16 + .../ala/names/index/ALANameAnalyserTest.java | 16 + .../ala/names/index/ALATaxonResolverTest.java | 16 + .../ala/names/index/CSVNameSourceTest.java | 16 + .../org/ala/names/index/NameProviderTest.java | 16 + .../ala/names/index/RankComparatorTest.java | 16 + .../ala/names/index/ScientificNameTest.java | 16 + .../org/ala/names/index/TaxonConceptTest.java | 16 + .../index/TaxonomyConfiugrationTest.java | 16 + .../au/org/ala/names/index/TaxonomyTest.java | 16 + .../index/provider/AndTaxonConditionTest.java | 16 + .../names/index/provider/KeyAdjusterTest.java | 16 + .../provider/MatchTaxonConditionTest.java | 16 + .../index/provider/OrTaxonConditionTest.java | 16 + .../index/provider/ScoreAdjusterTest.java | 16 + .../java/au/org/ala/names/util/TestUtils.java | 16 + .../au/org/ala/names/index/dwca-1/meta.xml | 16 + .../src/main/scripts/compare.sh | 16 + .../src/main/scripts/dump.sh | 16 + .../src/main/scripts/generate.sh | 16 + .../src/main/scripts/index.sh | 16 + .../src/main/scripts/merge.sh | 16 + .../model/LinnaeanRankClassification.java | 16 + .../au/org/ala/names/model/MatchMetrics.java | 16 + .../java/au/org/ala/names/model/NameFlag.java | 16 + .../au/org/ala/names/model/TaxonomicType.java | 16 + .../ala/names/model/TaxonomicTypeGroup.java | 16 + .../org/ala/names/model/VernacularType.java | 16 + .../ala/names/util/CleanedScientificName.java | 16 + .../org/ala/names/util/TaxonNameSoundEx.java | 16 + .../main/java/au/org/ala/vocab/ALATerm.java | 16 + .../parser/util/PhraseNameParserTests.java | 16 + .../names/util/CleanedScientificNameTest.java | 16 + .../gbif/nameparser/PhraseNameParserTest.java | 16 + .../au/org/ala/names/search/FieldType.java | 16 + 
.../org/ala/names/util/TaxonNameSoundEx.java | 16 + .../ala/names/search/ALANameSearcherTest.java | 16 + .../ala/names/search/BiocacheMatchTest.java | 16 + .../ala/names/search/IconicSpeciesTest.java | 16 + .../ala/names/search/VernacularMatchTest.java | 16 + .../org/ala/names/util/NameListComparer.java | 16 + .../org/ala/names/util/NameListGenerator.java | 16 + .../java/au/org/ala/names/util/TermDump.java | 16 + 83 files changed, 1781 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..566908108 --- /dev/null +++ b/LICENSE @@ -0,0 +1,469 @@ + MOZILLA PUBLIC LICENSE + Version 1.1 + + --------------- + +1. Definitions. + + 1.0.1. "Commercial Use" means distribution or otherwise making the + Covered Code available to a third party. + + 1.1. "Contributor" means each entity that creates or contributes to + the creation of Modifications. + + 1.2. "Contributor Version" means the combination of the Original + Code, prior Modifications used by a Contributor, and the Modifications + made by that particular Contributor. + + 1.3. "Covered Code" means the Original Code or Modifications or the + combination of the Original Code and Modifications, in each case + including portions thereof. + + 1.4. "Electronic Distribution Mechanism" means a mechanism generally + accepted in the software development community for the electronic + transfer of data. + + 1.5. "Executable" means Covered Code in any form other than Source + Code. + + 1.6. "Initial Developer" means the individual or entity identified + as the Initial Developer in the Source Code notice required by Exhibit + A. + + 1.7. "Larger Work" means a work which combines Covered Code or + portions thereof with code not governed by the terms of this License. + + 1.8. "License" means this document. + + 1.8.1. 
"Licensable" means having the right to grant, to the maximum + extent possible, whether at the time of the initial grant or + subsequently acquired, any and all of the rights conveyed herein. + + 1.9. "Modifications" means any addition to or deletion from the + substance or structure of either the Original Code or any previous + Modifications. When Covered Code is released as a series of files, a + Modification is: + A. Any addition to or deletion from the contents of a file + containing Original Code or previous Modifications. + + B. Any new file that contains any part of the Original Code or + previous Modifications. + + 1.10. "Original Code" means Source Code of computer software code + which is described in the Source Code notice required by Exhibit A as + Original Code, and which, at the time of its release under this + License is not already Covered Code governed by this License. + + 1.10.1. "Patent Claims" means any patent claim(s), now owned or + hereafter acquired, including without limitation, method, process, + and apparatus claims, in any patent Licensable by grantor. + + 1.11. "Source Code" means the preferred form of the Covered Code for + making modifications to it, including all modules it contains, plus + any associated interface definition files, scripts used to control + compilation and installation of an Executable, or source code + differential comparisons against either the Original Code or another + well known, available Covered Code of the Contributor's choice. The + Source Code can be in a compressed or archival form, provided the + appropriate decompression or de-archiving software is widely available + for no charge. + + 1.12. "You" (or "Your") means an individual or a legal entity + exercising rights under, and complying with all of the terms of, this + License or a future version of this License issued under Section 6.1. 
+ For legal entities, "You" includes any entity which controls, is + controlled by, or is under common control with You. For purposes of + this definition, "control" means (a) the power, direct or indirect, + to cause the direction or management of such entity, whether by + contract or otherwise, or (b) ownership of more than fifty percent + (50%) of the outstanding shares or beneficial ownership of such + entity. + +2. Source Code License. + + 2.1. The Initial Developer Grant. + The Initial Developer hereby grants You a world-wide, royalty-free, + non-exclusive license, subject to third party intellectual property + claims: + (a) under intellectual property rights (other than patent or + trademark) Licensable by Initial Developer to use, reproduce, + modify, display, perform, sublicense and distribute the Original + Code (or portions thereof) with or without Modifications, and/or + as part of a Larger Work; and + + (b) under Patents Claims infringed by the making, using or + selling of Original Code, to make, have made, use, practice, + sell, and offer for sale, and/or otherwise dispose of the + Original Code (or portions thereof). + + (c) the licenses granted in this Section 2.1(a) and (b) are + effective on the date Initial Developer first distributes + Original Code under the terms of this License. + + (d) Notwithstanding Section 2.1(b) above, no patent license is + granted: 1) for code that You delete from the Original Code; 2) + separate from the Original Code; or 3) for infringements caused + by: i) the modification of the Original Code or ii) the + combination of the Original Code with other software or devices. + + 2.2. Contributor Grant. 
+ Subject to third party intellectual property claims, each Contributor + hereby grants You a world-wide, royalty-free, non-exclusive license + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Contributor, to use, reproduce, modify, + display, perform, sublicense and distribute the Modifications + created by such Contributor (or portions thereof) either on an + unmodified basis, with other Modifications, as Covered Code + and/or as part of a Larger Work; and + + (b) under Patent Claims infringed by the making, using, or + selling of Modifications made by that Contributor either alone + and/or in combination with its Contributor Version (or portions + of such combination), to make, use, sell, offer for sale, have + made, and/or otherwise dispose of: 1) Modifications made by that + Contributor (or portions thereof); and 2) the combination of + Modifications made by that Contributor with its Contributor + Version (or portions of such combination). + + (c) the licenses granted in Sections 2.2(a) and 2.2(b) are + effective on the date Contributor first makes Commercial Use of + the Covered Code. + + (d) Notwithstanding Section 2.2(b) above, no patent license is + granted: 1) for any code that Contributor has deleted from the + Contributor Version; 2) separate from the Contributor Version; + 3) for infringements caused by: i) third party modifications of + Contributor Version or ii) the combination of Modifications made + by that Contributor with other software (except as part of the + Contributor Version) or other devices; or 4) under Patent Claims + infringed by Covered Code in the absence of Modifications made by + that Contributor. + +3. Distribution Obligations. + + 3.1. Application of License. + The Modifications which You create or to which You contribute are + governed by the terms of this License, including without limitation + Section 2.2. 
The Source Code version of Covered Code may be + distributed only under the terms of this License or a future version + of this License released under Section 6.1, and You must include a + copy of this License with every copy of the Source Code You + distribute. You may not offer or impose any terms on any Source Code + version that alters or restricts the applicable version of this + License or the recipients' rights hereunder. However, You may include + an additional document offering the additional rights described in + Section 3.5. + + 3.2. Availability of Source Code. + Any Modification which You create or to which You contribute must be + made available in Source Code form under the terms of this License + either on the same media as an Executable version or via an accepted + Electronic Distribution Mechanism to anyone to whom you made an + Executable version available; and if made available via Electronic + Distribution Mechanism, must remain available for at least twelve (12) + months after the date it initially became available, or at least six + (6) months after a subsequent version of that particular Modification + has been made available to such recipients. You are responsible for + ensuring that the Source Code version remains available even if the + Electronic Distribution Mechanism is maintained by a third party. + + 3.3. Description of Modifications. + You must cause all Covered Code to which You contribute to contain a + file documenting the changes You made to create that Covered Code and + the date of any change. You must include a prominent statement that + the Modification is derived, directly or indirectly, from Original + Code provided by the Initial Developer and including the name of the + Initial Developer in (a) the Source Code, and (b) in any notice in an + Executable version or related documentation in which You describe the + origin or ownership of the Covered Code. + + 3.4. Intellectual Property Matters + (a) Third Party Claims. 
+ If Contributor has knowledge that a license under a third party's + intellectual property rights is required to exercise the rights + granted by such Contributor under Sections 2.1 or 2.2, + Contributor must include a text file with the Source Code + distribution titled "LEGAL" which describes the claim and the + party making the claim in sufficient detail that a recipient will + know whom to contact. If Contributor obtains such knowledge after + the Modification is made available as described in Section 3.2, + Contributor shall promptly modify the LEGAL file in all copies + Contributor makes available thereafter and shall take other steps + (such as notifying appropriate mailing lists or newsgroups) + reasonably calculated to inform those who received the Covered + Code that new knowledge has been obtained. + + (b) Contributor APIs. + If Contributor's Modifications include an application programming + interface and Contributor has knowledge of patent licenses which + are reasonably necessary to implement that API, Contributor must + also include this information in the LEGAL file. + + (c) Representations. + Contributor represents that, except as disclosed pursuant to + Section 3.4(a) above, Contributor believes that Contributor's + Modifications are Contributor's original creation(s) and/or + Contributor has sufficient rights to grant the rights conveyed by + this License. + + 3.5. Required Notices. + You must duplicate the notice in Exhibit A in each file of the Source + Code. If it is not possible to put such notice in a particular Source + Code file due to its structure, then You must include such notice in a + location (such as a relevant directory) where a user would be likely + to look for such a notice. If You created one or more Modification(s) + You may add your name as a Contributor to the notice described in + Exhibit A. 
You must also duplicate this License in any documentation + for the Source Code where You describe recipients' rights or ownership + rights relating to Covered Code. You may choose to offer, and to + charge a fee for, warranty, support, indemnity or liability + obligations to one or more recipients of Covered Code. However, You + may do so only on Your own behalf, and not on behalf of the Initial + Developer or any Contributor. You must make it absolutely clear that + any such warranty, support, indemnity or liability obligation is + offered by You alone, and You hereby agree to indemnify the Initial + Developer and every Contributor for any liability incurred by the + Initial Developer or such Contributor as a result of warranty, + support, indemnity or liability terms You offer. + + 3.6. Distribution of Executable Versions. + You may distribute Covered Code in Executable form only if the + requirements of Section 3.1-3.5 have been met for that Covered Code, + and if You include a notice stating that the Source Code version of + the Covered Code is available under the terms of this License, + including a description of how and where You have fulfilled the + obligations of Section 3.2. The notice must be conspicuously included + in any notice in an Executable version, related documentation or + collateral in which You describe recipients' rights relating to the + Covered Code. You may distribute the Executable version of Covered + Code or ownership rights under a license of Your choice, which may + contain terms different from this License, provided that You are in + compliance with the terms of this License and that the license for the + Executable version does not attempt to limit or alter the recipient's + rights in the Source Code version from the rights set forth in this + License.
If You distribute the Executable version under a different + license You must make it absolutely clear that any terms which differ + from this License are offered by You alone, not by the Initial + Developer or any Contributor. You hereby agree to indemnify the + Initial Developer and every Contributor for any liability incurred by + the Initial Developer or such Contributor as a result of any such + terms You offer. + + 3.7. Larger Works. + You may create a Larger Work by combining Covered Code with other code + not governed by the terms of this License and distribute the Larger + Work as a single product. In such a case, You must make sure the + requirements of this License are fulfilled for the Covered Code. + +4. Inability to Comply Due to Statute or Regulation. + + If it is impossible for You to comply with any of the terms of this + License with respect to some or all of the Covered Code due to + statute, judicial order, or regulation then You must: (a) comply with + the terms of this License to the maximum extent possible; and (b) + describe the limitations and the code they affect. Such description + must be included in the LEGAL file described in Section 3.4 and must + be included with all distributions of the Source Code. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Application of this License. + + This License applies to code to which the Initial Developer has + attached the notice in Exhibit A and to related Covered Code. + +6. Versions of the License. + + 6.1. New Versions. + Netscape Communications Corporation ("Netscape") may publish revised + and/or new versions of the License from time to time. Each version + will be given a distinguishing version number. + + 6.2. Effect of New Versions. 
+ Once Covered Code has been published under a particular version of the + License, You may always continue to use it under the terms of that + version. You may also choose to use such Covered Code under the terms + of any subsequent version of the License published by Netscape. No one + other than Netscape has the right to modify the terms applicable to + Covered Code created under this License. + + 6.3. Derivative Works. + If You create or use a modified version of this License (which you may + only do in order to apply it to code which is not already Covered Code + governed by this License), You must (a) rename Your license so that + the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape", + "MPL", "NPL" or any confusingly similar phrase do not appear in your + license (except to note that your license differs from this License) + and (b) otherwise make it clear that Your version of the license + contains terms which differ from the Mozilla Public License and + Netscape Public License. (Filling in the name of the Initial + Developer, Original Code or Contributor in the notice described in + Exhibit A shall not of themselves be deemed to be modifications of + this License.) + +7. DISCLAIMER OF WARRANTY. + + COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, + WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF + DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. + THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE + IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, + YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE + COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER + OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF + ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. + +8. TERMINATION. + + 8.1. 
This License and the rights granted hereunder will terminate + automatically if You fail to comply with terms herein and fail to cure + such breach within 30 days of becoming aware of the breach. All + sublicenses to the Covered Code which are properly granted shall + survive any termination of this License. Provisions which, by their + nature, must remain in effect beyond the termination of this License + shall survive. + + 8.2. If You initiate litigation by asserting a patent infringement + claim (excluding declaratory judgment actions) against Initial Developer + or a Contributor (the Initial Developer or Contributor against whom + You file such action is referred to as "Participant") alleging that: + + (a) such Participant's Contributor Version directly or indirectly + infringes any patent, then any and all rights granted by such + Participant to You under Sections 2.1 and/or 2.2 of this License + shall, upon 60 days notice from Participant terminate prospectively, + unless if within 60 days after receipt of notice You either: (i) + agree in writing to pay Participant a mutually agreeable reasonable + royalty for Your past and future use of Modifications made by such + Participant, or (ii) withdraw Your litigation claim with respect to + the Contributor Version against such Participant. If within 60 days + of notice, a reasonable royalty and payment arrangement are not + mutually agreed upon in writing by the parties or the litigation claim + is not withdrawn, the rights granted by Participant to You under + Sections 2.1 and/or 2.2 automatically terminate at the expiration of + the 60 day notice period specified above.
+ + (b) any software, hardware, or device, other than such Participant's + Contributor Version, directly or indirectly infringes any patent, then + any rights granted to You by such Participant under Sections 2.1(b) + and 2.2(b) are revoked effective as of the date You first made, used, + sold, distributed, or had made, Modifications made by that + Participant. + + 8.3. If You assert a patent infringement claim against Participant + alleging that such Participant's Contributor Version directly or + indirectly infringes any patent where such claim is resolved (such as + by license or settlement) prior to the initiation of patent + infringement litigation, then the reasonable value of the licenses + granted by such Participant under Sections 2.1 or 2.2 shall be taken + into account in determining the amount or value of any payment or + license. + + 8.4. In the event of termination under Sections 8.1 or 8.2 above, + all end user license agreements (excluding distributors and resellers) + which have been validly granted by You or any distributor hereunder + prior to termination shall survive termination. + +9. LIMITATION OF LIABILITY. + + UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT + (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL + DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, + OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR + ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY + CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, + WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER + COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN + INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF + LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY + RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW + PROHIBITS SUCH LIMITATION. 
SOME JURISDICTIONS DO NOT ALLOW THE + EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO + THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU. + +10. U.S. GOVERNMENT END USERS. + + The Covered Code is a "commercial item," as that term is defined in + 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer + software" and "commercial computer software documentation," as such + terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 + C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), + all U.S. Government End Users acquire Covered Code with only those + rights set forth herein. + +11. MISCELLANEOUS. + + This License represents the complete agreement concerning subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. This License shall be governed by + California law provisions (except to the extent applicable law, if + any, provides otherwise), excluding its conflict-of-law provisions. + With respect to disputes in which at least one party is a citizen of, + or an entity chartered or registered to do business in the United + States of America, any litigation relating to this License shall be + subject to the jurisdiction of the Federal Courts of the Northern + District of California, with venue lying in Santa Clara County, + California, with the losing party responsible for costs, including + without limitation, court costs and reasonable attorneys' fees and + expenses. The application of the United Nations Convention on + Contracts for the International Sale of Goods is expressly excluded. + Any law or regulation which provides that the language of a contract + shall be construed against the drafter shall not apply to this + License. + +12. RESPONSIBILITY FOR CLAIMS. 
+ + As between Initial Developer and the Contributors, each party is + responsible for claims and damages arising, directly or indirectly, + out of its utilization of rights under this License and You agree to + work with Initial Developer and Contributors to distribute such + responsibility on an equitable basis. Nothing herein is intended or + shall be deemed to constitute any admission of liability. + +13. MULTIPLE-LICENSED CODE. + + Initial Developer may designate portions of the Covered Code as + "Multiple-Licensed". "Multiple-Licensed" means that the Initial + Developer permits you to utilize portions of the Covered Code under + Your choice of the MPL or the alternative licenses, if any, specified + by the Initial Developer in the file described in Exhibit A. + +EXHIBIT A -Mozilla Public License. + + ``The contents of this file are subject to the Mozilla Public License + Version 1.1 (the "License"); you may not use this file except in + compliance with the License. You may obtain a copy of the License at + https://www.mozilla.org/MPL/ + + Software distributed under the License is distributed on an "AS IS" + basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the + License for the specific language governing rights and limitations + under the License. + + The Original Code is ______________________________________. + + The Initial Developer of the Original Code is ________________________. + Portions created by ______________________ are Copyright (C) ______ + _______________________. All Rights Reserved. + + Contributor(s): ______________________________________. + + Alternatively, the contents of this file may be used under the terms + of the _____ license (the "[___] License"), in which case the + provisions of [______] License are applicable instead of those + above. 
If you wish to allow use of your version of this file only + under the terms of the [____] License and not to allow others to use + your version of this file under the MPL, indicate your decision by + deleting the provisions above and replace them with the notice and + other provisions required by the [___] License. If you do not delete + the provisions above, a recipient may use your version of this file + under either the MPL or the [___] License." + + [NOTE: The text of this Exhibit A may differ slightly from the text of + the notices in the Source Code files of the Original Code. You should + use the text of this Exhibit A rather than the text found in the + Original Code Source Code for Your Modifications.] diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java index 607eddb8e..46ccb467c 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.*; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java index 3e50a75e9..90f4e41fe 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.index.provider.ConceptResolutionPriority; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java index bfe13354f..e2ea911ed 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import java.util.ArrayList; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java index abe76be62..1647e977e 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java index 8a5f0f886..7b88dc89d 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java index 62dfd56bc..478aea1e7 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; /** diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java index 57bfc62a8..a32294293 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; /** diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java index c8807c42d..6f5a6873c 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. 
See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java index c7ad9c649..71a94218e 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java index 9542c9f02..0454df956 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; /** diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java index 00ef182e6..8d1d8933b 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.index.provider.*; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java index 926c766d7..b10bd9fad 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.vocab.ALATerm; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java index 9fbacf7d8..2849b5948 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java index 0e97c042e..960fb80f7 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import java.util.List; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java index fa9a8c722..ab4ecf920 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import javax.annotation.Nullable; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java index 3e2bee463..7e0d0d838 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java index e3f512fdf..68acc265f 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java index 74c5a1266..5506f81bf 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java index e51ddea3a..45888e35a 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java index d14c8459f..75670dddf 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; /** diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java index 3c2cffc12..b6c67c005 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java index 256fc4f7f..9b6014818 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java index b5e7eeeb0..84d238157 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.LinnaeanRankClassification; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java index 56648d19a..0a6e5335e 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.search.DwcaNameIndexer; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java index c51476068..751119928 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.util.GbifModule; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java index 17f5c477d..2f3b194ab 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java index cf39481e3..e12ef4e2c 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java index c1baa7787..ed1eda6f6 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; /** diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java index eb418b88e..217882cc7 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; /** diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java index 813cc5723..d30ca3392 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java index 362fb9bb9..f51a8f904 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java index 706049778..ce51a3b73 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java index 62f9c2218..0f2a554e0 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; /** diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java index e869c11a7..b22618e03 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java index cbd30d1a5..65d139750 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java index 1bfdde839..e949e4cae 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java index 7e7c21a8e..63c6ea65f 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java index 1b1691cae..00454e4c0 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.TaxonConceptInstance; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java index 280014e2e..bdcf81c0b 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import com.google.common.collect.Maps; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java index 1e90548d4..481c51a8d 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import com.fasterxml.jackson.core.*; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java index 04a527bcc..e37b99717 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java @@ -1,5 +1,21 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java index 18c3919b4..5868f2f18 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.util.TestUtils; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java index 809649320..498344ff8 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java index 686394d4d..cbf187c4a 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.TaxonomicType; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java index 40aa3cb38..6d628af90 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import static au.org.ala.names.model.RankType.*; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java index 8006d53f5..3df72347a 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java index 97c6c435d..bfa40b4c6 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java index fd668a849..f4fc12968 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.util.TestUtils; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java index 7c88c6946..adfb2fc65 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java index d79006135..f4cbb35cc 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.ALANameAnalyser; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java index 9759d26e4..1c01c3599 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.*; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java index a4e43cdea..c9e44d0de 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.*; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java index c9a564a3f..9389f5267 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.ALANameAnalyser; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java index dfc5d0885..eea18097a 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.ALANameAnalyser; diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java index 2117faef3..6800124f3 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import au.org.ala.names.index.NameProvider; diff --git a/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml index 05e5c1836..55dff6b37 100644 --- a/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml +++ b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml @@ -1,3 +1,19 @@ + + diff --git a/ala-name-matching-distribution/src/main/scripts/compare.sh b/ala-name-matching-distribution/src/main/scripts/compare.sh index 208e76ee3..952887cdb 100644 --- a/ala-name-matching-distribution/src/main/scripts/compare.sh +++ b/ala-name-matching-distribution/src/main/scripts/compare.sh @@ -1,4 +1,20 @@ #!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. 
+# +# + SCRIPT_HOME=`dirname $0` JAVA_OPTIONS="${JAVA_OPTIONS}" exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.NameListComparer $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/dump.sh b/ala-name-matching-distribution/src/main/scripts/dump.sh index bb43a64e8..03b3fa27e 100644 --- a/ala-name-matching-distribution/src/main/scripts/dump.sh +++ b/ala-name-matching-distribution/src/main/scripts/dump.sh @@ -1,4 +1,20 @@ #!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# + SCRIPT_HOME=`dirname $0` JAVA_OPTIONS="${JAVA_OPTIONS}" exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.TermDump $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/generate.sh b/ala-name-matching-distribution/src/main/scripts/generate.sh index 5043cc190..2c8cc304b 100644 --- a/ala-name-matching-distribution/src/main/scripts/generate.sh +++ b/ala-name-matching-distribution/src/main/scripts/generate.sh @@ -1,4 +1,20 @@ #!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. 
You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# + SCRIPT_HOME=`dirname $0` JAVA_OPTIONS="${JAVA_OPTIONS}" exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.NameListGenerator $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/index.sh b/ala-name-matching-distribution/src/main/scripts/index.sh index c2a771981..97f10ed39 100644 --- a/ala-name-matching-distribution/src/main/scripts/index.sh +++ b/ala-name-matching-distribution/src/main/scripts/index.sh @@ -1,4 +1,20 @@ #!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# + SCRIPT_HOME=`dirname $0` JAVA_OPTIONS="${JAVA_OPTIONS}" exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.search.DwcaNameIndexer $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/merge.sh b/ala-name-matching-distribution/src/main/scripts/merge.sh index 82ecda5f1..07fa3e583 100644 --- a/ala-name-matching-distribution/src/main/scripts/merge.sh +++ b/ala-name-matching-distribution/src/main/scripts/merge.sh @@ -1,4 +1,20 @@ #!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. 
+# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# + SCRIPT_HOME=`dirname $0` JAVA_OPTIONS="${JAVA_OPTIONS} -Xmx6G" exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.index.TaxonomyBuilder $* \ No newline at end of file diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java index 958b8041a..338f496d3 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.model; diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java index 0121e3a50..eccb0260d 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.model; import org.apache.commons.lang3.StringUtils; diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java index 74207ee73..41bbe6c55 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. 
See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.model; /** diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java index dfad2bba6..549414555 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.model; /** diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java index 0271b024a..8e15d144d 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.model; /** diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java index 92c197073..09cd73ee0 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.model; import java.util.HashMap; diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java b/ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java index afe92e6a1..5246e63ba 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import java.text.CharacterIterator; diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java b/ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java index bb941f19e..8708e97cb 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.util; import java.util.Collections; diff --git a/ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java b/ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java index cfaf7dc0b..cc3f1cd20 100644 --- a/ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.vocab; import org.gbif.dwc.terms.Term; diff --git a/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java b/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java index b49c4bae5..e5df2d7db 100644 --- a/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java +++ b/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. 
See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.parser.util; import au.org.ala.names.model.ALAParsedName; diff --git a/ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java b/ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java index 5a8c11d5b..b6cde1356 100644 --- a/ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java +++ b/ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java @@ -1,5 +1,21 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import org.junit.Test; diff --git a/ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java b/ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java index 976cb7d22..d9485da2b 100644 --- a/ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java +++ b/ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java @@ -1,5 +1,21 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package org.gbif.nameparser; import au.org.ala.names.model.ALAParsedName; diff --git a/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java b/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java index 61d1d45f3..b8a6efc60 100644 --- a/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.search; import au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer; diff --git a/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java b/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java index bb941f19e..8708e97cb 100644 --- a/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import java.util.Collections; diff --git a/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java index c5163709b..1d80ca4f5 100644 --- a/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java @@ -1,5 +1,21 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.search; import au.org.ala.names.model.*; diff --git a/ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java index f50c41ea6..a4ba57a8a 100644 --- a/ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.search; import au.org.ala.names.model.*; diff --git a/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java index d628d60a7..7fb5224fd 100644 --- a/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.search; import au.org.ala.names.model.LinnaeanRankClassification; diff --git a/ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java index 03e07f09b..6d221ee79 100644 --- a/ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.search; import au.org.ala.names.model.LinnaeanRankClassification; diff --git a/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java index 622b43856..e1696868b 100644 --- a/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import com.opencsv.*; diff --git a/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java index 2a000a38b..c3992ce11 100644 --- a/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import com.opencsv.CSVWriter; diff --git a/ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java index 5d405f68c..aaf356902 100644 --- a/ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.util; import org.apache.commons.cli.*; From 9ae632cdfc08f27b2e14c646ffd5e6592f4fbe27 Mon Sep 17 00:00:00 2001 From: pal155 Date: Thu, 7 Oct 2021 11:02:10 +1100 Subject: [PATCH 14/19] Make distribution module not make a jar --- ala-name-matching-distribution/pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/ala-name-matching-distribution/pom.xml b/ala-name-matching-distribution/pom.xml index 870f50442..fd59570af 100644 --- a/ala-name-matching-distribution/pom.xml +++ b/ala-name-matching-distribution/pom.xml @@ -10,6 +10,7 @@ 4.0.0 ala-name-matching-distribution + pom ALA Name Matching Distribution Distribution along with dependencies From 2949f40da6da8c476bad8f0702236a62bfef8b1b Mon Sep 17 00:00:00 2001 From: pal155 Date: Fri, 8 Oct 2021 11:14:55 +1100 Subject: [PATCH 15/19] Fix for https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/126 where a term appended onto an 'aff.' name is being treated as an author. --- .../org/ala/names/index/ALANameAnalyser.java | 40 ++++++++++++------- .../org/ala/names/search/DwcaNameIndexer.java | 8 ++++ .../ala/names/index/ALANameAnalyserTest.java | 21 ++++++++++ .../au/org/ala/names/index/TaxonomyTest.java | 35 ++++++++++++++++ .../au/org/ala/names/index/taxonomy-32.csv | 4 ++ .../ala/names/search/ALANameSearcherTest.java | 17 ++++++++ 6 files changed, 110 insertions(+), 15 deletions(-) create mode 100644 ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-32.csv diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java index 46ccb467c..5227cf350 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java @@ -95,6 +95,11 @@ public class ALANameAnalyser extends NameAnalyser { * Pattern for bare (no 
proper period) rank markers */ protected static final Pattern LOOSE_MARKERS = Pattern.compile("\\s+(?:" + RANK_MARKERS + "|" + RANK_PLACEHOLDER_MARKERS + ")\\.?\\s+"); + /** + * Pattern for unsure markers (cf, aff etc) + */ + protected static final Pattern UNSURE_MARKER = Pattern.compile("\\s+(?:cf|cfr|conf|aff)\\.?\\s+" ); + /** * Pattern for non-name characters */ @@ -218,22 +223,27 @@ public NameKey analyse(@Nullable NomenclaturalCode code, String scientificName, scientificName = (left + " " + right).trim(); } } - try { - name = this.nameParser.parse(scientificName, (rankType == null || rankType == RankType.UNRANKED) ? null : rankType.getCbRank()); - if (name != null) { - nameType = name.getType(); - if (rankType == null && name.getRank() != null) - rankType = RankType.getForCBRank(name.getRank()); + if (UNSURE_MARKER.matcher(scientificName).find()) { + // Leave this well alone but indicate that it is doubtful + nameType = NameType.DOUBTFUL; + } else { + try { + name = this.nameParser.parse(scientificName, (rankType == null || rankType == RankType.UNRANKED) ? 
null : rankType.getCbRank()); + if (name != null) { + nameType = name.getType(); + if (rankType == null && name.getRank() != null) + rankType = RankType.getForCBRank(name.getRank()); + } + } catch (UnparsableException ex) { + // Oh well, worth a try } - } catch (UnparsableException ex) { - // Oh well, worth a try - } - if (loose) { - if (scientificNameAuthorship == null && name != null) { - String ac = this.normalise(name.authorshipComplete()); - if (ac != null && !ac.isEmpty() && !(name instanceof ALAParsedName)) { // ALAParsedName indicates a phrase name; leave as-is - scientificName = name.buildName(true, true, false, true, true, false, true, false, true, false, false, false, true, true); - scientificNameAuthorship = ac; + if (loose) { + if (scientificNameAuthorship == null && name != null) { + String ac = this.normalise(name.authorshipComplete()); + if (ac != null && !ac.isEmpty() && !(name instanceof ALAParsedName)) { // ALAParsedName indicates a phrase name; leave as-is + scientificName = name.buildName(true, true, false, true, true, false, true, false, true, false, false, false, true, true); + scientificNameAuthorship = ac; + } } } } diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java index c3f02b8c5..974f5fec1 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java @@ -52,6 +52,8 @@ import java.io.IOException; import java.io.InputStream; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * @@ -82,6 +84,9 @@ public class DwcaNameIndexer extends ALANameIndexer { RankType.KINGDOM, RankType.PHYLUM, RankType.CLASS, RankType.ORDER, RankType.FAMILY }; + /** Detect names with an additional locality in parentheses at the end */ + protected static final Pattern 
LOCALITY_PATTERN = Pattern.compile("^([\\p{Alnum}.'()\\s]+)\\s+\\([\\p{Alnum}\\s]+\\)\\s*$"); + private static int PAGE_SIZE = 25000; private boolean loadingIndex; private boolean sciIndex; @@ -533,6 +538,9 @@ public boolean createLoadingIndex(File archiveDirectory) throws Exception{ nc = this.buildNameComplete(sn, sna, nc); otherNames.add(sn); otherNames.add(nc); + Matcher locality = LOCALITY_PATTERN.matcher(sn); + if (locality.matches()) + otherNames.add(locality.group(1).trim()); } } doc.add(new StoredField(NameIndexField.PRIORITY.toString(), score < 0 ? defaultScore : score)); diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java index e37b99717..5f565c9fb 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java @@ -269,6 +269,27 @@ public void testKey25() throws Exception { assertEquals(RankType.UNRANKED, key.getRank()); } + + // Test aff. name looks like an author + @Test + public void testKey26() throws Exception { + // With author + NameKey key1 = this.analyser.analyse(null, "Carex aff. tereticaulis (Lake Omeo)", "sensu G.W. Carr", RankType.UNRANKED, TaxonomicType.INFERRED_UNPLACED, true); + assertEquals(null, key1.getCode()); + assertEquals(NameType.DOUBTFUL, key1.getType()); + assertEquals("CAREX AFF TERETICAULIS LAKE OMEO", key1.getScientificName()); + assertEquals("sensu G.W. Carr", key1.getScientificNameAuthorship()); + assertEquals(RankType.UNRANKED, key1.getRank()); + + // Without author + NameKey key2 = this.analyser.analyse(null, "Carex aff.
tereticaulis (Lake Omeo)", null, RankType.UNRANKED, TaxonomicType.INFERRED_UNPLACED, true); + assertEquals(null, key2.getCode()); + assertEquals(NameType.DOUBTFUL, key2.getType()); + assertEquals("CAREX AFF TERETICAULIS LAKE OMEO", key2.getScientificName()); + assertEquals(null, key2.getScientificNameAuthorship()); + assertEquals(RankType.UNRANKED, key2.getRank()); + } + @Test public void testAuthorEquals1() throws Exception { assertEquals(0, this.analyser.compareAuthor(null, null)); diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java index adfb2fc65..fb6af0c88 100644 --- a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java @@ -420,6 +420,7 @@ public void testResolveUnranked6() throws Exception { assertEquals(RankType.UNRANKED, syn1.getRank()); assertNull(syn1.getProvenance()); } + @Test public void testResolveUnranked7() throws Exception { TaxonomyConfiguration config = TaxonomyConfiguration.read(this.resourceReader("taxonomy-config-2.json")); @@ -447,6 +448,40 @@ public void testResolveUnranked7() throws Exception { assertFalse(syn1.getProvenance() == null || syn1.getProvenance().isEmpty()); } + // Issue #126 Do not merge X y and X aff. 
y + @Test + public void testResolveUnranked8() throws Exception { + TaxonomyConfiguration config = TaxonomyConfiguration.read(this.resourceReader("taxonomy-config-2.json")); + this.taxonomy = new Taxonomy(config, null); + this.taxonomy.begin(); + CSVNameSource source1 = new CSVNameSource(this.resourceReader("taxonomy-32.csv"), DwcTerm.Taxon); + this.taxonomy.load(Arrays.asList(source1)); + this.taxonomy.resolve(); + TaxonConceptInstance acc1 = this.taxonomy.getInstance("Concept-1-1"); + TaxonConceptInstance acc2 = this.taxonomy.getInstance("Concept-2-1"); + TaxonConceptInstance acc3 = this.taxonomy.getInstance("Concept-3-1"); + assertNotNull(acc1); + assertNotNull(acc2); + assertNotNull(acc3); + TaxonConcept tcAcc1 = acc1.getContainer(); + TaxonConcept tcAcc2 = acc2.getContainer(); + TaxonConcept tcAcc3 = acc3.getContainer(); + assertNotSame(tcAcc1, tcAcc2); + assertNotSame(tcAcc1, tcAcc3); + assertNotSame(tcAcc2, tcAcc3); + ScientificName sn1 = tcAcc1.getContainer(); + ScientificName sn2 = tcAcc2.getContainer(); + ScientificName sn3 = tcAcc2.getContainer(); + assertNotSame(sn1, sn2); + assertSame(sn2, sn3); + UnrankedScientificName usn1 = sn1.getContainer(); + UnrankedScientificName usn2 = sn2.getContainer(); + assertNotSame(usn1, usn2); + BareName bn1 = usn1.getContainer(); + BareName bn2 = usn2.getContainer(); + assertNotSame(bn1, bn2); + } + // Test placement on an uncoded name @Test public void testPlaceUncoded1() throws Exception { diff --git a/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-32.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-32.csv new file mode 100644 index 000000000..adb1b624f --- /dev/null +++ b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-32.csv @@ -0,0 +1,4 @@ 
+taxonID,parentNameUsageID,acceptedNameUsageID,datasetID,nomenclaturalCode,scientificName,scientificNameAuthorship,taxonRank,taxonConceptID,scientificNameID,taxonomicStatus,nomenclaturalStatus,establishmentMeans,nameAccordingToID,nameAccordingTo,namePublishedInID,namePublishedIn,namePubishedInYear,nameComplete,nameFormatted,source +"Concept-1-1","","","dr100","ICN","Carex tereticaulis","F.Muell.","species","","","accepted","","","","","","","","","","" +"Concept-2-1","","","dr108","","Carex aff. tereticaulis (Lake Omeo)","","unranked","","","inferredUnplaced","","","","","","","","","","" +"Concept-3-1","","","dr108","","Carex aff. tereticaulis (Lake Omeo)","sensu G.W. Carr","unranked","","","inferredUnplaced","","","","","","","","","","" diff --git a/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java index 1d80ca4f5..2782b2ddd 100644 --- a/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java @@ -1534,6 +1534,23 @@ public void testSimpleLookup17() throws Exception { } + @Test + public void testAffLookup1() throws Exception { + String name = "Carex aff. tereticaulis (Lake Omeo)"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("ALA_186619", nsr.getLsid()); + name = "Carex aff. 
tereticaulis"; + nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("ALA_186619", nsr.getLsid()); + name = "Carex tereticaulis"; + nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/node/apni/2919780", nsr.getLsid()); + } + + @Test public void testMetricsLookup1() throws Exception { String name = "Geopelia placida"; From a80341f08e440dcd21af3fb53bb7637c456b5955 Mon Sep 17 00:00:00 2001 From: pal155 Date: Fri, 8 Oct 2021 15:32:38 +1100 Subject: [PATCH 16/19] Fix for https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/131 to align opencsv versions in libraries. --- .../au/org/ala/names/index/ALANameAnalyser.java | 13 ++++++++++++- .../java/au/org/ala/names/index/CSVNameSource.java | 7 ++++--- .../java/au/org/ala/names/index/NameSource.java | 3 +++ .../names/parser/util/PhraseNameParserTests.java | 14 +++++++++++++- .../au/org/ala/names/search/IconicSpeciesTest.java | 14 +++++++++++++- .../au/org/ala/names/util/NameListComparer.java | 5 +++-- pom.xml | 2 +- 7 files changed, 49 insertions(+), 9 deletions(-) diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java index 5227cf350..d48770214 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java @@ -18,6 +18,8 @@ import au.org.ala.names.model.*; import au.org.ala.names.util.CleanedScientificName; +import com.opencsv.CSVParser; +import com.opencsv.CSVParserBuilder; import com.opencsv.CSVReader; import com.opencsv.CSVReaderBuilder; import org.gbif.api.exception.UnparsableException; @@ -36,6 +38,7 @@ import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.io.FileReader; import java.io.InputStreamReader; import java.util.*; import 
java.util.function.Predicate; @@ -359,7 +362,15 @@ protected > void loadCsv(String resource, Map map, */ protected void loadPatternCsv(String resource, List list) { try { - CSVReader reader = new CSVReader(new InputStreamReader(this.getClass().getResourceAsStream(resource), "UTF-8"), ',', '"', 1); + CSVParser csvParser = new CSVParserBuilder() + .withSeparator('\t') + .withQuoteChar('"') + .withEscapeChar('\\') + .build(); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(this.getClass().getResourceAsStream(resource), "UTF-8")) + .withCSVParser(csvParser) + .withSkipLines(1) + .build(); String[] next; while ((next = reader.readNext()) != null) { String label = next[0]; diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java index 1647e977e..fe2836ea9 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java @@ -21,6 +21,7 @@ import au.org.ala.vocab.ALATerm; import com.opencsv.CSVReader; import com.opencsv.CSVReaderBuilder; +import com.opencsv.exceptions.CsvValidationException; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; @@ -68,7 +69,7 @@ public class CSVNameSource extends NameSource { * @param reader The file reader * @param rowType The type of row in the CSV */ - public CSVNameSource(Reader reader, Term rowType) throws IOException { + public CSVNameSource(Reader reader, Term rowType) throws IOException, CsvValidationException { this.name = "Reader " + System.identityHashCode(reader); this.reader = new CSVReaderBuilder(reader).build(); this.rowType = rowType; @@ -82,12 +83,12 @@ public CSVNameSource(Reader reader, Term rowType) throws IOException { * @param encoding The source encoding * @param rowType The type of row in 
the CSV */ - public CSVNameSource(Path path, String encoding, Term rowType) throws IOException { + public CSVNameSource(Path path, String encoding, Term rowType) throws IOException, CsvValidationException { this(Files.newBufferedReader(path, Charset.forName(encoding)), rowType); this.name = path.toUri().toASCIIString(); } - protected void collectColumns() throws IOException { + protected void collectColumns() throws IOException, CsvValidationException { TermFactory factory = TermFactory.instance(); int index = 0; String[] header = reader.readNext(); diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java index b10bd9fad..9d15082d9 100644 --- a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java @@ -17,6 +17,7 @@ package au.org.ala.names.index; import au.org.ala.vocab.ALATerm; +import com.opencsv.exceptions.CsvValidationException; import org.apache.commons.collections.MapUtils; import org.gbif.api.model.registry.Citation; import org.gbif.api.model.registry.Contact; @@ -374,6 +375,8 @@ public static NameSource create(File f) throws IndexBuilderException { return ns; } catch (IOException ex) { throw new UncheckedIOException(ex); + } catch (CsvValidationException ex) { + throw new IllegalStateException(ex); } } } diff --git a/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java b/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java index e5df2d7db..8ba03d9b3 100644 --- a/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java +++ b/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java @@ -17,6 +17,10 @@ package au.org.ala.names.parser.util; import au.org.ala.names.model.ALAParsedName; 
+import com.opencsv.CSVParser; +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; import org.gbif.api.exception.UnparsableException; import org.gbif.api.model.checklistbank.ParsedName; import org.gbif.api.vocabulary.NameType; @@ -457,7 +461,15 @@ public void testVirusName2() throws Exception { //@Test public void testAllNamesForType() { try { - com.opencsv.CSVReader reader = new com.opencsv.CSVReader(new FileReader("/data/names/Version2011/ala_concepts_dump.txt"), '\t', '"', '\\', 1); + CSVParser csvParser = new CSVParserBuilder() + .withSeparator('\t') + .withQuoteChar('"') + .withEscapeChar('\\') + .build(); + CSVReader reader = new CSVReaderBuilder(new FileReader("/data/names/Version2011/ala_concepts_dump.txt")) + .withCSVParser(csvParser) + .withSkipLines(1) + .build(); PhraseNameParser parser = new PhraseNameParser(); int i = 0; for (String[] values = reader.readNext(); values != null; values = reader.readNext()) { diff --git a/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java index 7fb5224fd..42a696fe1 100644 --- a/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java @@ -19,10 +19,14 @@ import au.org.ala.names.model.LinnaeanRankClassification; import au.org.ala.names.model.NameSearchResult; import au.org.ala.names.model.RankType; +import com.opencsv.CSVParser; +import com.opencsv.CSVParserBuilder; import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; import org.apache.commons.lang3.StringUtils; import org.junit.Test; +import java.io.FileReader; import java.io.InputStreamReader; import java.util.List; @@ -103,7 +107,15 @@ public void testIconicSpeciesSRCCOL() { @Test public void testIconicSpeciesFile() { try { - CSVReader reader = new 
CSVReader(new InputStreamReader(this.getClass().getResourceAsStream("iconic_species_list.csv")), ',', '"'); + CSVParser csvParser = new CSVParserBuilder() + .withSeparator(',') + .withQuoteChar('"') + .withEscapeChar('\\') + .build(); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(this.getClass().getResourceAsStream("iconic_species_list.csv"))) + .withCSVParser(csvParser) + .withSkipLines(1) + .build(); String[] values; int passed = 0, failed = 0; diff --git a/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java index e1696868b..522e97a81 100644 --- a/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java @@ -19,6 +19,7 @@ import com.opencsv.*; import au.org.ala.names.model.*; import au.org.ala.names.search.*; +import com.opencsv.exceptions.CsvValidationException; import org.apache.commons.cli.*; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; @@ -88,7 +89,7 @@ protected String mapTerm(String column) { return null; } - protected void readHeader() throws IOException { + protected void readHeader() throws IOException, CsvValidationException { String[] header = names.readNext(); int i = 0; @@ -246,7 +247,7 @@ public String[] match(String[] row) { return values.toArray(new String[values.size()]); } - public void compare() throws IOException { + public void compare() throws IOException, CsvValidationException { String[] row, match; int count = 0; diff --git a/pom.xml b/pom.xml index 7a031300c..44c9c22d6 100644 --- a/pom.xml +++ b/pom.xml @@ -41,7 +41,7 @@ 4.13.1 1.7.25 1.6.2 - 4.1 + 5.0 1.2 2.4 1.32 From e1286254b99014635430d9c53e66cbf45e4d70ab Mon Sep 17 00:00:00 2001 From: pal155 Date: Tue, 12 Oct 2021 16:50:32 +1100 Subject: [PATCH 17/19] Fix for 
https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/132 to produce slightly more sensible autocomplete. Searches use the synonym flag to include/exclude synonyms. Results are tested against the query name for similarity, squashing down over-enthusiastic long matches. --- .../org/ala/names/search/ALANameSearcher.java | 91 ++++++---- .../ala/names/search/AutocompleteTest.java | 162 ++++++++++++++++++ 2 files changed, 217 insertions(+), 36 deletions(-) create mode 100644 ala-name-matching-search/src/test/java/au/org/ala/names/search/AutocompleteTest.java diff --git a/ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java b/ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java index 451e522b4..3cbe595df 100644 --- a/ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java @@ -30,6 +30,8 @@ import org.gbif.api.vocabulary.NameType; import org.gbif.api.vocabulary.Rank; import org.gbif.nameparser.PhraseNameParser; +import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; +import uk.ac.shef.wit.simmetrics.similaritymetrics.Levenshtein; import java.io.*; import java.nio.file.Path; @@ -73,6 +75,21 @@ public class ALANameSearcher { public static final Pattern affPattern = Pattern.compile("([\\x00-\\x7F\\s]*) aff[#!?\\\\. ]([\\x00-\\x7F\\s]*)"); public static final Pattern cfPattern = Pattern.compile("([\\x00-\\x7F\\s]*) cf[#!?\\\\.
]([\\x00-\\x7F\\s]*)"); + private static Comparator AUTOCOMPLETE_COMPARATOR = new Comparator() { + @Override + public int compare(Map o1, Map o2) { + if (o1 == o2) + return 0; + if (o1 == null) + return Integer.MAX_VALUE; + if (o2 == null) + return Integer.MIN_VALUE; + float score1 = (float) o1.getOrDefault("score", 1.0f); + float score2 = (float) o2.getOrDefault("score", 1.0f); + return -Float.compare(score1, score2); + } + }; + /** * A set of names that are cross rank homonyms. */ @@ -1792,7 +1809,7 @@ public List getGuidsForTaxa(List taxaQueries) { return guids; } - private void appendAutocompleteResults(Map output, TopDocs results, boolean includeSynonyms, boolean commonNameResults) throws IOException { + private void appendAutocompleteResults(Map output, TopDocs results, boolean includeSynonyms, boolean commonNameResults, String q, AbstractStringMetric similarity) throws IOException { ScoreDoc[] scoreDocs = results.scoreDocs; int scoreDocsCount = scoreDocs.length; for(int excludedResult = 0; excludedResult < scoreDocsCount; ++excludedResult) { @@ -1804,11 +1821,16 @@ private void appendAutocompleteResults(Map output, TopDocs results, if (nsr == null || (nsr.getLeft() == null && !includeSynonyms)) continue; - Map m = formatAutocompleteNsr(i.score, nsr); + String name = commonNameResults ? 
src.get("common_orig") : src.get("name"); + float score = similarity.getSimilarity(q, name); + score *= i.score; + if (!commonNameResults) + score *= 2.0f; + Map m = formatAutocompleteNsr(score, nsr); //use the matched common name if (commonNameResults) { - m.put("commonname", src.get("common_orig")); + m.put("commonname", name); m.put("match", "commonName"); } else { m.put("match", "scientificName"); @@ -1836,10 +1858,7 @@ private void appendAutocompleteResults(Map output, TopDocs results, } if (((nsr != null && nsr.getAcceptedLsid() == null) || includeSynonyms) && m != null) { - if (m.get("name").toString().equals("Acacia")) { - int aa = 4; - } - Map existing = output.get(m.get("lsid").toString()); + Map existing = output.get(m.get("lsid").toString()); if (existing == null) { output.put(m.get("lsid").toString(), m); } else { @@ -2043,36 +2062,36 @@ private String getExtendedTaxonConceptByGuid(String guid, boolean checkPreferred */ public List autocomplete(String q, int max, boolean includeSynonyms) { try { - if(false) { - return null; - } else { - Map output = new HashMap(); - - //more queries for better scoring values - String lq = q.toLowerCase(); - String uq = q.toUpperCase(); - - //name search - Query fq = buildAutocompleteQuery(NameIndexField.NAME, lq, false); - BooleanQuery b = new BooleanQuery.Builder() - .add(fq, BooleanClause.Occur.MUST) - .add(NameIndexField.LEFT.searchWildcard("*"), includeSynonyms ? 
BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST) - .build(); - TopDocs results = cbSearcher.search(b, max); - appendAutocompleteResults(output, results, includeSynonyms, false); - - //format search term for the current common name index - uq = concatName(uq).toUpperCase(); - - //common name search - fq = buildAutocompleteQuery(NameIndexField.SEARCHABLE_COMMON_NAME, uq, true); - results = vernSearcher.search(fq, max); - appendAutocompleteResults(output, results, includeSynonyms, true); - - return new ArrayList(output.values()); - } + AbstractStringMetric similarity = new Levenshtein(); + Map output = new HashMap<>(); + + //more queries for better scoring values + String lq = q.toLowerCase(); + String uq = q.toUpperCase(); + + //name search + Query fq = buildAutocompleteQuery(NameIndexField.NAME, lq, false); + BooleanQuery.Builder bb = new BooleanQuery.Builder(); + bb.add(fq, BooleanClause.Occur.MUST); + if (!includeSynonyms) + bb.add(NameIndexField.iS_SYNONYM.search("T"), BooleanClause.Occur.MUST_NOT); + BooleanQuery b = bb.build(); + TopDocs results = cbSearcher.search(b, max); + appendAutocompleteResults(output, results, includeSynonyms, false, q, similarity); + + //format search term for the current common name index + uq = concatName(uq).toUpperCase(); + + //common name search + fq = buildAutocompleteQuery(NameIndexField.SEARCHABLE_COMMON_NAME, uq, true); + results = vernSearcher.search(fq, max); + appendAutocompleteResults(output, results, includeSynonyms, true, q, similarity); + + List matches = new ArrayList<>(output.values()); + matches.sort(AUTOCOMPLETE_COMPARATOR); + return matches; } catch (Exception e) { - log.error("Autocomplete error.",e); + log.error("Autocomplete error.", e); } return null; } diff --git a/ala-name-matching-search/src/test/java/au/org/ala/names/search/AutocompleteTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/AutocompleteTest.java new file mode 100644 index 000000000..0f0925add --- /dev/null +++ 
b/ala-name-matching-search/src/test/java/au/org/ala/names/search/AutocompleteTest.java @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + +package au.org.ala.names.search; + +import au.org.ala.names.model.*; +import org.gbif.api.model.checklistbank.ParsedName; +import org.gbif.nameparser.PhraseNameParser; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.List; +import java.util.Map; + +import static org.junit.Assert.*; + +public class AutocompleteTest { + private static ALANameSearcher searcher; + + @org.junit.BeforeClass + public static void init() throws Exception { + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); + } + + @Test + public void testAutocomplete1() throws Exception { + List results = searcher.autocomplete("Elusor", 10, false); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Elusor", first.get("name")); + } + + @Test + public void testAutocomplete2() throws Exception { + List results = searcher.autocomplete("Mary riv", 10, false); + assertNotNull(results); + assertTrue(results.size() > 1); + Map first = results.get(0); + assertEquals("Samadera sp. 
Mary River", first.get("name")); + Map second = results.get(1); + assertEquals("Mary River cod", second.get("commonname")); + assertEquals("Maccullochella mariensis", second.get("name")); + Map third = results.get(2); + assertEquals("Mary River turtle", third.get("commonname")); + assertEquals("Elusor macrurus", third.get("name")); + } + + @Test + public void testAutocomplete3() throws Exception { + List results = searcher.autocomplete("Mary river t", 10, false); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Mary River turtle", first.get("commonname")); + assertEquals("Elusor macrurus", first.get("name")); + } + + @Test + public void testAutocomplete4() throws Exception { + List results = searcher.autocomplete("Acacia", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Acacia", first.get("name")); + } + + @Test + public void testAutocomplete5() throws Exception { + List results = searcher.autocomplete("Acacia d", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Acacia dampieri", first.get("name")); + } + + @Test + public void testAutocomplete6() throws Exception { + List results = searcher.autocomplete("Mylitta pse", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Hysterangium pseudacaciae", first.get("name")); + assertNotNull(first.get("synonymMatch")); + } + + + @Test + public void testAutocomplete7() throws Exception { + // No match with synonym + List results = searcher.autocomplete("Mylitta pse", 10, false); + assertNotNull(results); + assertTrue(results.isEmpty()); + } + + + @Test + public void testAutocomplete8() throws Exception { + // No match with garbage + List results = searcher.autocomplete("Glurglefkluff11", 10, true); + assertNotNull(results); + assertTrue(results.isEmpty()); + } + + @Test + 
public void testAutocomplete9() throws Exception { + List results = searcher.autocomplete("Osphra", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Osphranter", first.get("name")); + } + + @Test + public void testAutocomplete10() throws Exception { + List results = searcher.autocomplete("Rossi", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Rhachotropis rossi", first.get("name")); + } + + + @Test + public void testAutocomplete11() throws Exception { + List results = searcher.autocomplete("rush", 10, false); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Juncus", first.get("name")); + assertEquals("Rushes", first.get("commonname")); + } + + + @Test + public void testAutocomplete12() throws Exception { + List results = searcher.autocomplete("rush", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Acacia alleniana", first.get("name")); + List synonyms = (List) first.get("synonymMatch"); + assertNotNull(synonyms); + assertTrue(synonyms.size() > 0); + Map synonym = synonyms.get(0); + assertEquals("Rush-leaved Wattle", synonym.get("commonname")); + } + +} \ No newline at end of file From c4c124d3dbc6c24a8ba6d4f148f6358939cde454 Mon Sep 17 00:00:00 2001 From: pal155 Date: Thu, 14 Oct 2021 07:41:57 +1100 Subject: [PATCH 18/19] Fix wrongly formatted informal patterns --- .../src/main/java/au/org/ala/names/index/ALANameAnalyser.java | 2 +- .../main/resources/au/org/ala/names/index/informal_names.csv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java index d48770214..60fd91098 100644 --- 
a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java @@ -363,7 +363,7 @@ protected > void loadCsv(String resource, Map map, protected void loadPatternCsv(String resource, List list) { try { CSVParser csvParser = new CSVParserBuilder() - .withSeparator('\t') + .withSeparator(',') .withQuoteChar('"') .withEscapeChar('\\') .build(); diff --git a/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv index e73d214ba..23973ee57 100644 --- a/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv +++ b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv @@ -1,3 +1,3 @@ label,pattern,title,link numbered group,^\\p{Alpha}+\\s+(?i:group|subgroup|species)\\s*\\d+\\.?,Informal groups of the form , -gen nov,(?i:gen\\.?\\s+nov\\.?)$,New genus name type gen. nov., +gen nov,(?i:gen\\.?\\s+nov\\.?)$,New genus name type gen. 
nov., \ No newline at end of file From c4830ab8ebb60455e81348d5dff009449dab5440 Mon Sep 17 00:00:00 2001 From: pal155 Date: Thu, 14 Oct 2021 07:51:07 +1100 Subject: [PATCH 19/19] Release 4.0 --- ala-name-matching-builder/pom.xml | 2 +- ala-name-matching-distribution/pom.xml | 2 +- ala-name-matching-model/pom.xml | 2 +- ala-name-matching-search/pom.xml | 2 +- ala-name-matching-tools/pom.xml | 2 +- pom.xml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ala-name-matching-builder/pom.xml b/ala-name-matching-builder/pom.xml index 8ae325219..411e0a054 100644 --- a/ala-name-matching-builder/pom.xml +++ b/ala-name-matching-builder/pom.xml @@ -5,7 +5,7 @@ au.org.ala ala-name-matching - 4.0-SNAPSHOT + 4.0 ala-name-matching-builder diff --git a/ala-name-matching-distribution/pom.xml b/ala-name-matching-distribution/pom.xml index fd59570af..29a1d09f4 100644 --- a/ala-name-matching-distribution/pom.xml +++ b/ala-name-matching-distribution/pom.xml @@ -5,7 +5,7 @@ ala-name-matching au.org.ala - 4.0-SNAPSHOT + 4.0 4.0.0 diff --git a/ala-name-matching-model/pom.xml b/ala-name-matching-model/pom.xml index 5e4bf9222..8ad2889a0 100644 --- a/ala-name-matching-model/pom.xml +++ b/ala-name-matching-model/pom.xml @@ -5,7 +5,7 @@ ala-name-matching au.org.ala - 4.0-SNAPSHOT + 4.0 4.0.0 diff --git a/ala-name-matching-search/pom.xml b/ala-name-matching-search/pom.xml index 5bfb1842a..3feaba613 100644 --- a/ala-name-matching-search/pom.xml +++ b/ala-name-matching-search/pom.xml @@ -5,7 +5,7 @@ au.org.ala ala-name-matching - 4.0-SNAPSHOT + 4.0 ala-name-matching-search diff --git a/ala-name-matching-tools/pom.xml b/ala-name-matching-tools/pom.xml index 41b6a19b1..a06455898 100644 --- a/ala-name-matching-tools/pom.xml +++ b/ala-name-matching-tools/pom.xml @@ -5,7 +5,7 @@ ala-name-matching au.org.ala - 4.0-SNAPSHOT + 4.0 4.0.0 diff --git a/pom.xml b/pom.xml index 44c9c22d6..db8b02d73 100644 --- a/pom.xml +++ b/pom.xml @@ -11,7 +11,7 @@ ala-name-matching pom - 4.0-SNAPSHOT + 
4.0 ala-name-matching-model