diff --git a/.travis.yml b/.travis.yml index e0186b7ae..bd4809b34 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,15 +10,15 @@ branches: before_install: - mkdir -p ~/.m2; wget -q -O ~/.m2/settings.xml https://raw.githubusercontent.com/AtlasOfLivingAustralia/travis-build-configuration/master/travis_maven_settings_simple.xml -- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20200214.tgz https://archives.ala.org.au/archives/nameindexes/20200214/namematching-20200214.tgz +- sudo mkdir -p /data/lucene; sudo wget -O /data/lucene/namematching-20210811.tgz https://archives.ala.org.au/archives/nameindexes/20210811/namematching-20210811.tgz - cd /data/lucene -- sudo tar zxvf namematching-20200214.tgz -- sudo ln -s namematching-20200214 namematching +- sudo tar zxvf namematching-20210811.tgz +- sudo ln -s namematching-20210811 namematching - ls -laF - cd $TRAVIS_BUILD_DIR script: -- "[ \"${TRAVIS_PULL_REQUEST}\" = \"false\" ] && mvn -P travis clean install deploy || mvn -P travis clean install" +- 'if [ "${TRAVIS_PULL_REQUEST}" = "false" ]; then mvn -P travis clean install deploy; else mvn -P travis clean install; fi' env: global: diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..566908108 --- /dev/null +++ b/LICENSE @@ -0,0 +1,469 @@ + MOZILLA PUBLIC LICENSE + Version 1.1 + + --------------- + +1. Definitions. + + 1.0.1. "Commercial Use" means distribution or otherwise making the + Covered Code available to a third party. + + 1.1. "Contributor" means each entity that creates or contributes to + the creation of Modifications. + + 1.2. "Contributor Version" means the combination of the Original + Code, prior Modifications used by a Contributor, and the Modifications + made by that particular Contributor. + + 1.3. "Covered Code" means the Original Code or Modifications or the + combination of the Original Code and Modifications, in each case + including portions thereof. + + 1.4. 
"Electronic Distribution Mechanism" means a mechanism generally + accepted in the software development community for the electronic + transfer of data. + + 1.5. "Executable" means Covered Code in any form other than Source + Code. + + 1.6. "Initial Developer" means the individual or entity identified + as the Initial Developer in the Source Code notice required by Exhibit + A. + + 1.7. "Larger Work" means a work which combines Covered Code or + portions thereof with code not governed by the terms of this License. + + 1.8. "License" means this document. + + 1.8.1. "Licensable" means having the right to grant, to the maximum + extent possible, whether at the time of the initial grant or + subsequently acquired, any and all of the rights conveyed herein. + + 1.9. "Modifications" means any addition to or deletion from the + substance or structure of either the Original Code or any previous + Modifications. When Covered Code is released as a series of files, a + Modification is: + A. Any addition to or deletion from the contents of a file + containing Original Code or previous Modifications. + + B. Any new file that contains any part of the Original Code or + previous Modifications. + + 1.10. "Original Code" means Source Code of computer software code + which is described in the Source Code notice required by Exhibit A as + Original Code, and which, at the time of its release under this + License is not already Covered Code governed by this License. + + 1.10.1. "Patent Claims" means any patent claim(s), now owned or + hereafter acquired, including without limitation, method, process, + and apparatus claims, in any patent Licensable by grantor. + + 1.11. 
"Source Code" means the preferred form of the Covered Code for + making modifications to it, including all modules it contains, plus + any associated interface definition files, scripts used to control + compilation and installation of an Executable, or source code + differential comparisons against either the Original Code or another + well known, available Covered Code of the Contributor's choice. The + Source Code can be in a compressed or archival form, provided the + appropriate decompression or de-archiving software is widely available + for no charge. + + 1.12. "You" (or "Your") means an individual or a legal entity + exercising rights under, and complying with all of the terms of, this + License or a future version of this License issued under Section 6.1. + For legal entities, "You" includes any entity which controls, is + controlled by, or is under common control with You. For purposes of + this definition, "control" means (a) the power, direct or indirect, + to cause the direction or management of such entity, whether by + contract or otherwise, or (b) ownership of more than fifty percent + (50%) of the outstanding shares or beneficial ownership of such + entity. + +2. Source Code License. + + 2.1. The Initial Developer Grant. + The Initial Developer hereby grants You a world-wide, royalty-free, + non-exclusive license, subject to third party intellectual property + claims: + (a) under intellectual property rights (other than patent or + trademark) Licensable by Initial Developer to use, reproduce, + modify, display, perform, sublicense and distribute the Original + Code (or portions thereof) with or without Modifications, and/or + as part of a Larger Work; and + + (b) under Patents Claims infringed by the making, using or + selling of Original Code, to make, have made, use, practice, + sell, and offer for sale, and/or otherwise dispose of the + Original Code (or portions thereof). 
+ + (c) the licenses granted in this Section 2.1(a) and (b) are + effective on the date Initial Developer first distributes + Original Code under the terms of this License. + + (d) Notwithstanding Section 2.1(b) above, no patent license is + granted: 1) for code that You delete from the Original Code; 2) + separate from the Original Code; or 3) for infringements caused + by: i) the modification of the Original Code or ii) the + combination of the Original Code with other software or devices. + + 2.2. Contributor Grant. + Subject to third party intellectual property claims, each Contributor + hereby grants You a world-wide, royalty-free, non-exclusive license + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Contributor, to use, reproduce, modify, + display, perform, sublicense and distribute the Modifications + created by such Contributor (or portions thereof) either on an + unmodified basis, with other Modifications, as Covered Code + and/or as part of a Larger Work; and + + (b) under Patent Claims infringed by the making, using, or + selling of Modifications made by that Contributor either alone + and/or in combination with its Contributor Version (or portions + of such combination), to make, use, sell, offer for sale, have + made, and/or otherwise dispose of: 1) Modifications made by that + Contributor (or portions thereof); and 2) the combination of + Modifications made by that Contributor with its Contributor + Version (or portions of such combination). + + (c) the licenses granted in Sections 2.2(a) and 2.2(b) are + effective on the date Contributor first makes Commercial Use of + the Covered Code. 
+ + (d) Notwithstanding Section 2.2(b) above, no patent license is + granted: 1) for any code that Contributor has deleted from the + Contributor Version; 2) separate from the Contributor Version; + 3) for infringements caused by: i) third party modifications of + Contributor Version or ii) the combination of Modifications made + by that Contributor with other software (except as part of the + Contributor Version) or other devices; or 4) under Patent Claims + infringed by Covered Code in the absence of Modifications made by + that Contributor. + +3. Distribution Obligations. + + 3.1. Application of License. + The Modifications which You create or to which You contribute are + governed by the terms of this License, including without limitation + Section 2.2. The Source Code version of Covered Code may be + distributed only under the terms of this License or a future version + of this License released under Section 6.1, and You must include a + copy of this License with every copy of the Source Code You + distribute. You may not offer or impose any terms on any Source Code + version that alters or restricts the applicable version of this + License or the recipients' rights hereunder. However, You may include + an additional document offering the additional rights described in + Section 3.5. + + 3.2. Availability of Source Code. + Any Modification which You create or to which You contribute must be + made available in Source Code form under the terms of this License + either on the same media as an Executable version or via an accepted + Electronic Distribution Mechanism to anyone to whom you made an + Executable version available; and if made available via Electronic + Distribution Mechanism, must remain available for at least twelve (12) + months after the date it initially became available, or at least six + (6) months after a subsequent version of that particular Modification + has been made available to such recipients. 
You are responsible for + ensuring that the Source Code version remains available even if the + Electronic Distribution Mechanism is maintained by a third party. + + 3.3. Description of Modifications. + You must cause all Covered Code to which You contribute to contain a + file documenting the changes You made to create that Covered Code and + the date of any change. You must include a prominent statement that + the Modification is derived, directly or indirectly, from Original + Code provided by the Initial Developer and including the name of the + Initial Developer in (a) the Source Code, and (b) in any notice in an + Executable version or related documentation in which You describe the + origin or ownership of the Covered Code. + + 3.4. Intellectual Property Matters + (a) Third Party Claims. + If Contributor has knowledge that a license under a third party's + intellectual property rights is required to exercise the rights + granted by such Contributor under Sections 2.1 or 2.2, + Contributor must include a text file with the Source Code + distribution titled "LEGAL" which describes the claim and the + party making the claim in sufficient detail that a recipient will + know whom to contact. If Contributor obtains such knowledge after + the Modification is made available as described in Section 3.2, + Contributor shall promptly modify the LEGAL file in all copies + Contributor makes available thereafter and shall take other steps + (such as notifying appropriate mailing lists or newsgroups) + reasonably calculated to inform those who received the Covered + Code that new knowledge has been obtained. + + (b) Contributor APIs. + If Contributor's Modifications include an application programming + interface and Contributor has knowledge of patent licenses which + are reasonably necessary to implement that API, Contributor must + also include this information in the LEGAL file. + + (c) Representations. 
+ Contributor represents that, except as disclosed pursuant to + Section 3.4(a) above, Contributor believes that Contributor's + Modifications are Contributor's original creation(s) and/or + Contributor has sufficient rights to grant the rights conveyed by + this License. + + 3.5. Required Notices. + You must duplicate the notice in Exhibit A in each file of the Source + Code. If it is not possible to put such notice in a particular Source + Code file due to its structure, then You must include such notice in a + location (such as a relevant directory) where a user would be likely + to look for such a notice. If You created one or more Modification(s) + You may add your name as a Contributor to the notice described in + Exhibit A. You must also duplicate this License in any documentation + for the Source Code where You describe recipients' rights or ownership + rights relating to Covered Code. You may choose to offer, and to + charge a fee for, warranty, support, indemnity or liability + obligations to one or more recipients of Covered Code. However, You + may do so only on Your own behalf, and not on behalf of the Initial + Developer or any Contributor. You must make it absolutely clear than + any such warranty, support, indemnity or liability obligation is + offered by You alone, and You hereby agree to indemnify the Initial + Developer and every Contributor for any liability incurred by the + Initial Developer or such Contributor as a result of warranty, + support, indemnity or liability terms You offer. + + 3.6. Distribution of Executable Versions. + You may distribute Covered Code in Executable form only if the + requirements of Section 3.1-3.5 have been met for that Covered Code, + and if You include a notice stating that the Source Code version of + the Covered Code is available under the terms of this License, + including a description of how and where You have fulfilled the + obligations of Section 3.2. 
The notice must be conspicuously included + in any notice in an Executable version, related documentation or + collateral in which You describe recipients' rights relating to the + Covered Code. You may distribute the Executable version of Covered + Code or ownership rights under a license of Your choice, which may + contain terms different from this License, provided that You are in + compliance with the terms of this License and that the license for the + Executable version does not attempt to limit or alter the recipient's + rights in the Source Code version from the rights set forth in this + License. If You distribute the Executable version under a different + license You must make it absolutely clear that any terms which differ + from this License are offered by You alone, not by the Initial + Developer or any Contributor. You hereby agree to indemnify the + Initial Developer and every Contributor for any liability incurred by + the Initial Developer or such Contributor as a result of any such + terms You offer. + + 3.7. Larger Works. + You may create a Larger Work by combining Covered Code with other code + not governed by the terms of this License and distribute the Larger + Work as a single product. In such a case, You must make sure the + requirements of this License are fulfilled for the Covered Code. + +4. Inability to Comply Due to Statute or Regulation. + + If it is impossible for You to comply with any of the terms of this + License with respect to some or all of the Covered Code due to + statute, judicial order, or regulation then You must: (a) comply with + the terms of this License to the maximum extent possible; and (b) + describe the limitations and the code they affect. Such description + must be included in the LEGAL file described in Section 3.4 and must + be included with all distributions of the Source Code. 
Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Application of this License. + + This License applies to code to which the Initial Developer has + attached the notice in Exhibit A and to related Covered Code. + +6. Versions of the License. + + 6.1. New Versions. + Netscape Communications Corporation ("Netscape") may publish revised + and/or new versions of the License from time to time. Each version + will be given a distinguishing version number. + + 6.2. Effect of New Versions. + Once Covered Code has been published under a particular version of the + License, You may always continue to use it under the terms of that + version. You may also choose to use such Covered Code under the terms + of any subsequent version of the License published by Netscape. No one + other than Netscape has the right to modify the terms applicable to + Covered Code created under this License. + + 6.3. Derivative Works. + If You create or use a modified version of this License (which you may + only do in order to apply it to code which is not already Covered Code + governed by this License), You must (a) rename Your license so that + the phrases "Mozilla", "MOZILLAPL", "MOZPL", "Netscape", + "MPL", "NPL" or any confusingly similar phrase do not appear in your + license (except to note that your license differs from this License) + and (b) otherwise make it clear that Your version of the license + contains terms which differ from the Mozilla Public License and + Netscape Public License. (Filling in the name of the Initial + Developer, Original Code or Contributor in the notice described in + Exhibit A shall not of themselves be deemed to be modifications of + this License.) + +7. DISCLAIMER OF WARRANTY. 
+ + COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, + WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF + DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. + THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE + IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, + YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE + COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER + OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF + ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. + +8. TERMINATION. + + 8.1. This License and the rights granted hereunder will terminate + automatically if You fail to comply with terms herein and fail to cure + such breach within 30 days of becoming aware of the breach. All + sublicenses to the Covered Code which are properly granted shall + survive any termination of this License. Provisions which, by their + nature, must remain in effect beyond the termination of this License + shall survive. + + 8.2. 
If You initiate litigation by asserting a patent infringement + claim (excluding declatory judgment actions) against Initial Developer + or a Contributor (the Initial Developer or Contributor against whom + You file such action is referred to as "Participant") alleging that: + + (a) such Participant's Contributor Version directly or indirectly + infringes any patent, then any and all rights granted by such + Participant to You under Sections 2.1 and/or 2.2 of this License + shall, upon 60 days notice from Participant terminate prospectively, + unless if within 60 days after receipt of notice You either: (i) + agree in writing to pay Participant a mutually agreeable reasonable + royalty for Your past and future use of Modifications made by such + Participant, or (ii) withdraw Your litigation claim with respect to + the Contributor Version against such Participant. If within 60 days + of notice, a reasonable royalty and payment arrangement are not + mutually agreed upon in writing by the parties or the litigation claim + is not withdrawn, the rights granted by Participant to You under + Sections 2.1 and/or 2.2 automatically terminate at the expiration of + the 60 day notice period specified above. + + (b) any software, hardware, or device, other than such Participant's + Contributor Version, directly or indirectly infringes any patent, then + any rights granted to You by such Participant under Sections 2.1(b) + and 2.2(b) are revoked effective as of the date You first made, used, + sold, distributed, or had made, Modifications made by that + Participant. + + 8.3. 
If You assert a patent infringement claim against Participant + alleging that such Participant's Contributor Version directly or + indirectly infringes any patent where such claim is resolved (such as + by license or settlement) prior to the initiation of patent + infringement litigation, then the reasonable value of the licenses + granted by such Participant under Sections 2.1 or 2.2 shall be taken + into account in determining the amount or value of any payment or + license. + + 8.4. In the event of termination under Sections 8.1 or 8.2 above, + all end user license agreements (excluding distributors and resellers) + which have been validly granted by You or any distributor hereunder + prior to termination shall survive termination. + +9. LIMITATION OF LIABILITY. + + UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT + (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE INITIAL + DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, + OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO ANY PERSON FOR + ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY + CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, + WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER + COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN + INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF + LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY + RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW + PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE + EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO + THIS EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU. + +10. U.S. GOVERNMENT END USERS. + + The Covered Code is a "commercial item," as that term is defined in + 48 C.F.R. 2.101 (Oct. 
1995), consisting of "commercial computer + software" and "commercial computer software documentation," as such + terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 + C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), + all U.S. Government End Users acquire Covered Code with only those + rights set forth herein. + +11. MISCELLANEOUS. + + This License represents the complete agreement concerning subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. This License shall be governed by + California law provisions (except to the extent applicable law, if + any, provides otherwise), excluding its conflict-of-law provisions. + With respect to disputes in which at least one party is a citizen of, + or an entity chartered or registered to do business in the United + States of America, any litigation relating to this License shall be + subject to the jurisdiction of the Federal Courts of the Northern + District of California, with venue lying in Santa Clara County, + California, with the losing party responsible for costs, including + without limitation, court costs and reasonable attorneys' fees and + expenses. The application of the United Nations Convention on + Contracts for the International Sale of Goods is expressly excluded. + Any law or regulation which provides that the language of a contract + shall be construed against the drafter shall not apply to this + License. + +12. RESPONSIBILITY FOR CLAIMS. + + As between Initial Developer and the Contributors, each party is + responsible for claims and damages arising, directly or indirectly, + out of its utilization of rights under this License and You agree to + work with Initial Developer and Contributors to distribute such + responsibility on an equitable basis. Nothing herein is intended or + shall be deemed to constitute any admission of liability. + +13. 
MULTIPLE-LICENSED CODE. + + Initial Developer may designate portions of the Covered Code as + "Multiple-Licensed". "Multiple-Licensed" means that the Initial + Developer permits you to utilize portions of the Covered Code under + Your choice of the MPL or the alternative licenses, if any, specified + by the Initial Developer in the file described in Exhibit A. + +EXHIBIT A -Mozilla Public License. + + ``The contents of this file are subject to the Mozilla Public License + Version 1.1 (the "License"); you may not use this file except in + compliance with the License. You may obtain a copy of the License at + https://www.mozilla.org/MPL/ + + Software distributed under the License is distributed on an "AS IS" + basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the + License for the specific language governing rights and limitations + under the License. + + The Original Code is ______________________________________. + + The Initial Developer of the Original Code is ________________________. + Portions created by ______________________ are Copyright (C) ______ + _______________________. All Rights Reserved. + + Contributor(s): ______________________________________. + + Alternatively, the contents of this file may be used under the terms + of the _____ license (the "[___] License"), in which case the + provisions of [______] License are applicable instead of those + above. If you wish to allow use of your version of this file only + under the terms of the [____] License and not to allow others to use + your version of this file under the MPL, indicate your decision by + deleting the provisions above and replace them with the notice and + other provisions required by the [___] License. If you do not delete + the provisions above, a recipient may use your version of this file + under either the MPL or the [___] License." 
+ + [NOTE: The text of this Exhibit A may differ slightly from the text of + the notices in the Source Code files of the Original Code. You should + use the text of this Exhibit A rather than the text found in the + Original Code Source Code for Your Modifications.] diff --git a/README.md b/README.md index 8e0b9cfba..8b293148f 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,25 @@ This API borrows heavily from the name parsing great work done by [GBIF](https:/ in their [scientific name parser library](https://github.com/gbif/name-parser) This code contains additions for handling some Australian specific issues. +## Modules + +* **ala-name-matching-model** The data model used by the name matching index. + This module contains a number of useful vocabularies that you may want to + include in your application, even if you don't want to name match. +* **ala-name-matching-search** Local name index searching. + Include this in your application if you want to match names against a local name index. +* **ala-name-matching-builder** Merge taxonomies and build name indexes. + This is a separate module to the searcher so that you can build the name + index that the searcher uses, without importing a shedload of dependencies + if you just want to search for things. +* **ala-name-matching-tools** Some useful utilities that can be used to + do bulk matching for testing and the like. +* **ala-name-matching-distribution** A full distribution zip file, including + some shell scripts to get various commands going. + ## Versions -Currently there are 2 versions of this library, 2.x and 3.x. -* 2.x is using lucene 4. -* 3.x is using lucene 6 or above. +Version 4.x of the library uses Lucene 8.
## Generating a name match index @@ -41,17 +55,19 @@ You can download the IRMNG DwCA for homonyms from the following URL: An assembly zip file for this can be downloaded from our maven repository : -[ala-name-matching-3.5-distribution.zip](http://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching/3.5/ala-name-matching-3.5-distribution.zip) +[ala-name-matching-4.0-SNAPSHOT-distribution.zip](http://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching/3.5/ala-name-matching-3.5-distribution.zip) To generate the name index using the data described above, follow these steps. Alternatively use the [ALA Ansible scripts](https://github.com/AtlasOfLivingAustralia/ala-install) here using the playbook [nameindexer.yml](https://github.com/AtlasOfLivingAustralia/ala-install/blob/master/ansible/nameindexer-standalone.yml) which does it all for you. * Download the zip files linked above to a directory e.g. /data/names/ and extract them -* Download the distribution zip [ala-name-matching-3.5-distribution.zip](http://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching/3.5/ala-name-matching-3.5-distribution.zip) +* Download the distribution zip [ala-name-matching-distribution-4.0-SNAPSHOT-distribution.zip](http://nexus.ala.org.au/service/local/repositories/releases/content/au/org/ala/ala-name-matching/3.5/ala-name-matching-distribution-4.0-SNAPSHOT-distribution.zip) + and unzip it. + You will find a number of shell scripts in the base directory.
* Generate the names index with command: ``` -java -jar ala-name-matching-3.5.jar --all --dwca /data/names/dwca-col --target /data/lucene/testdwc-namematching --irmng /data/names/irmng/IRMNG_DWC_HOMONYMS --common /data/names/col_vernacular.txt +./index.sh --all --dwca /data/names/dwca-col --target /data/lucene/testdwc-namematching --irmng /data/names/irmng/IRMNG_DWC_HOMONYMS --common /data/names/col_vernacular.txt ``` Please be aware that the names indexing could take over an hour to complete. @@ -66,7 +82,7 @@ into a single, combined taxonomy. An example command for the taxonomy builder is: ``` -java --classpath au.org.ala.names.index.TaxonomyBuilder -c /data/names/ala-taxon-config.json -w tmp -o /data/names/combined /data/names/APNI/DwC /data/names/AFD/DwC /data/names/CAAB/DwC +./merge.sh -c /data/names/ala-taxon-config.json -w tmp -o /data/names/combined /data/names/APNI/DwC /data/names/AFD/DwC /data/names/CAAB/DwC ``` More information about the merge configuration can be found [here](doc/merge-config.md). @@ -76,14 +92,18 @@ More information about the merge configuration can be found [here](doc/merge-con This library is built with maven. By default a `mvn install` will try to run a test suite which will fail without a local installation of a name index. To skip this step, run a build with ```mvn install -DskipTests=true```. 
-The build creates 3 artefacts in the ala-name-matching/target directory: +The build creates one artefact in the `ala-name-matching-distribution/target` directory: + +* ala-name-matching-distribution-4.0-SNAPSHOT-distribution.zip - zip containing the project jar and dependencies -* ala-name-matching-3.5.jar - built jar for the project code only -* ala-name-matching-3.5-distribution.zip - zip containing the project jar and dependencies -* ala-name-matching-3.5-sources.jar - source jar for the project code only +Each module contains two artefacts in the +`ala-name-matching/ala-name-matching-<module>/target` directory: -The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20200214) and needs to be extracted to the -directory `/data/lucene/namematching-20200214` +* ala-name-matching-<module>-4.0-SNAPSHOT.jar - built jar for the project code only +* ala-name-matching-<module>-4.0-SNAPSHOT-sources.jar - source jar for the project code only + +The name index for Australian names lists used in unit tests can be downloaded [from here](https://biocache.ala.org.au/archives/nameindexes/20220629) and needs to be extracted to the +directory `/data/lucene/namematching-20210811` ## ALA Names List @@ -116,19 +136,29 @@ The ALA Name Matching is available as a library that can be used in other projec To use ala-name-matching, include it as a dependency in your pom file: ``` -<dependency> - <groupId>au.org.ala</groupId> - <artifactId>ala-name-matching</artifactId> - <version>3.5</version> -</dependency> +<dependency> + <groupId>au.org.ala</groupId> + <artifactId>ala-name-matching-search</artifactId> + <version>4.0-SNAPSHOT</version> +</dependency> ``` +If you just want the handy enums and such-like, use +``` +<dependency> + <groupId>au.org.ala</groupId> + <artifactId>ala-name-matching-model</artifactId> + <version>4.0-SNAPSHOT</version> +</dependency> +``` + + If you are using grails 3, you may encounter problems with the newer GBIF libraries having validation code that conflicts with spring validation.
You can correct this by using ``` -compile("au.org.ala:ala-name-matching:3.5") { +compile("au.org.ala:ala-name-matching-search:4.0-SNAPSHOT") { exclude group: 'org.slf4j', module: 'slf4j-log4j12' exclude group: 'org.apache.bval', module: 'org.apache.bval.bundle' } diff --git a/ala-name-matching-builder/pom.xml b/ala-name-matching-builder/pom.xml new file mode 100644 index 000000000..411e0a054 --- /dev/null +++ b/ala-name-matching-builder/pom.xml @@ -0,0 +1,79 @@ + + + 4.0.0 + + + au.org.ala + ala-name-matching + 4.0 + + + ala-name-matching-builder + jar + ALA Name Matching Taxonomy Merging and Index Building + Tools to first merge multiple taxonomies together and then build a searchable index out of the resulting taxonomy + + + au.org.ala + ala-name-matching-model + ${project.version} + + + au.org.ala + ala-name-matching-search + ${project.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + + org.gbif + dwca-io + ${dwca-io.version} + + + commons-io + commons-io + + + org.slf4j + slf4j-api + + + + + org.gbif.checklistbank + checklistbank-common + ${checklist-bank.version} + + + org.gbif.registry + registry-ws-client + + + com.beust + jcommander + + + org.slf4j + jcl-over-slf4j + + + io.dropwizard.metrics + metrics-core + + + io.dropwizard.metrics + metrics-ganglia + + + + + commons-cli + commons-cli + ${commons-cli.version} + + + diff --git a/src/main/java/au/org/ala/names/index/ALANameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java similarity index 89% rename from src/main/java/au/org/ala/names/index/ALANameAnalyser.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java index 607eddb8e..60fd91098 100644 --- a/src/main/java/au/org/ala/names/index/ALANameAnalyser.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALANameAnalyser.java @@ -1,7 +1,25 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.*; import au.org.ala.names.util.CleanedScientificName; +import com.opencsv.CSVParser; +import com.opencsv.CSVParserBuilder; import com.opencsv.CSVReader; import com.opencsv.CSVReaderBuilder; import org.gbif.api.exception.UnparsableException; @@ -20,6 +38,7 @@ import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.io.FileReader; import java.io.InputStreamReader; import java.util.*; import java.util.function.Predicate; @@ -79,6 +98,11 @@ public class ALANameAnalyser extends NameAnalyser { * Pattern for bare (no proper period) rank markers */ protected static final Pattern LOOSE_MARKERS = Pattern.compile("\\s+(?:" + RANK_MARKERS + "|" + RANK_PLACEHOLDER_MARKERS + ")\\.?\\s+"); + /** + * Pattern for unsure markers (cf, aff etc) + */ + protected static final Pattern UNSURE_MARKER = Pattern.compile("\\s+(?:cf|cfr|conf|aff)\\.?\\s+" ); + /** * Pattern for non-name characters */ @@ -202,22 +226,27 @@ public NameKey analyse(@Nullable NomenclaturalCode code, String scientificName, scientificName = (left + " " + right).trim(); } } - try { - name = this.nameParser.parse(scientificName, (rankType == null || rankType == RankType.UNRANKED) ? 
null : rankType.getCbRank()); - if (name != null) { - nameType = name.getType(); - if (rankType == null && name.getRank() != null) - rankType = RankType.getForCBRank(name.getRank()); + if (UNSURE_MARKER.matcher(scientificName).find()) { + // Leave this well alone but indicate that it is doubtful + nameType = NameType.DOUBTFUL; + } else { + try { + name = this.nameParser.parse(scientificName, (rankType == null || rankType == RankType.UNRANKED) ? null : rankType.getCbRank()); + if (name != null) { + nameType = name.getType(); + if (rankType == null && name.getRank() != null) + rankType = RankType.getForCBRank(name.getRank()); + } + } catch (UnparsableException ex) { + // Oh well, worth a try } - } catch (UnparsableException ex) { - // Oh well, worth a try - } - if (loose) { - if (scientificNameAuthorship == null && name != null) { - String ac = this.normalise(name.authorshipComplete()); - if (ac != null && !ac.isEmpty() && !(name instanceof ALAParsedName)) { // ALAParsedName indicates a phrase name; leave as-is - scientificName = name.buildName(true, true, false, true, true, false, true, false, true, false, false, false, true, true); - scientificNameAuthorship = ac; + if (loose) { + if (scientificNameAuthorship == null && name != null) { + String ac = this.normalise(name.authorshipComplete()); + if (ac != null && !ac.isEmpty() && !(name instanceof ALAParsedName)) { // ALAParsedName indicates a phrase name; leave as-is + scientificName = name.buildName(true, true, false, true, true, false, true, false, true, false, false, false, true, true); + scientificNameAuthorship = ac; + } } } } @@ -333,7 +362,15 @@ protected > void loadCsv(String resource, Map map, */ protected void loadPatternCsv(String resource, List list) { try { - CSVReader reader = new CSVReader(new InputStreamReader(this.getClass().getResourceAsStream(resource), "UTF-8"), ',', '"', 1); + CSVParser csvParser = new CSVParserBuilder() + .withSeparator(',') + .withQuoteChar('"') + .withEscapeChar('\\') + 
.build(); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(this.getClass().getResourceAsStream(resource), "UTF-8")) + .withCSVParser(csvParser) + .withSkipLines(1) + .build(); String[] next; while ((next = reader.readNext()) != null) { String label = next[0]; diff --git a/src/main/java/au/org/ala/names/index/ALATaxonResolver.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java similarity index 96% rename from src/main/java/au/org/ala/names/index/ALATaxonResolver.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java index 3e50a75e9..90f4e41fe 100644 --- a/src/main/java/au/org/ala/names/index/ALATaxonResolver.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ALATaxonResolver.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.index.provider.ConceptResolutionPriority; diff --git a/src/main/java/au/org/ala/names/index/BareName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java similarity index 89% rename from src/main/java/au/org/ala/names/index/BareName.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java index 5902e1974..e2ea911ed 100644 --- a/src/main/java/au/org/ala/names/index/BareName.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/BareName.java @@ -1,9 +1,24 @@ -package au.org.ala.names.index; +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ -import au.org.ala.names.model.RankType; -import org.gbif.api.vocabulary.NomenclaturalCode; +package au.org.ala.names.index; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.stream.Collectors; /** diff --git a/src/main/java/au/org/ala/names/index/CSVNameSource.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java similarity index 94% rename from src/main/java/au/org/ala/names/index/CSVNameSource.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java index 672d2027c..fe2836ea9 100644 --- a/src/main/java/au/org/ala/names/index/CSVNameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/CSVNameSource.java @@ -1,10 +1,27 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; -import au.org.ala.vocab.ALATerm; import au.org.ala.names.model.RankType; import au.org.ala.names.model.TaxonomicType; +import au.org.ala.vocab.ALATerm; import com.opencsv.CSVReader; import com.opencsv.CSVReaderBuilder; +import com.opencsv.exceptions.CsvValidationException; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; @@ -17,7 +34,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.IOException; +import java.io.Reader; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; @@ -51,7 +69,7 @@ public class CSVNameSource extends NameSource { * @param reader The file reader * @param rowType The type of row in the CSV */ - public CSVNameSource(Reader reader, Term rowType) throws IOException { + public CSVNameSource(Reader reader, Term rowType) throws IOException, CsvValidationException { this.name = "Reader " + System.identityHashCode(reader); this.reader = new CSVReaderBuilder(reader).build(); this.rowType = rowType; @@ -65,12 +83,12 @@ public CSVNameSource(Reader reader, Term rowType) throws IOException { * @param encoding The source encoding * @param rowType The type of row in the CSV */ - public CSVNameSource(Path path, String encoding, Term rowType) throws IOException { + public CSVNameSource(Path path, String encoding, Term rowType) throws IOException, CsvValidationException { this(Files.newBufferedReader(path, Charset.forName(encoding)), rowType); this.name = path.toUri().toASCIIString(); } - protected void collectColumns() throws IOException { + protected void collectColumns() throws IOException, CsvValidationException { TermFactory factory = TermFactory.instance(); int index = 0; String[] header = reader.readNext(); diff --git a/src/main/java/au/org/ala/names/index/DwcaNameSource.java 
b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java similarity index 93% rename from src/main/java/au/org/ala/names/index/DwcaNameSource.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java index d4473cca0..7b88dc89d 100644 --- a/src/main/java/au/org/ala/names/index/DwcaNameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/DwcaNameSource.java @@ -1,9 +1,25 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; -import au.org.ala.names.model.VernacularType; -import au.org.ala.vocab.ALATerm; import au.org.ala.names.model.RankType; import au.org.ala.names.model.TaxonomicType; +import au.org.ala.names.model.VernacularType; +import au.org.ala.vocab.ALATerm; import org.apache.commons.beanutils.BeanUtils; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -13,15 +29,15 @@ import org.gbif.api.model.registry.Dataset; import org.gbif.api.vocabulary.*; import org.gbif.dwc.terms.DcTerm; -import org.gbif.dwc.terms.GbifTerm; -import org.gbif.dwca.io.MetadataException; -import org.gbif.dwca.record.Record; -import org.gbif.dwca.record.StarRecord; import org.gbif.dwc.terms.DwcTerm; +import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; import org.gbif.dwca.io.Archive; import org.gbif.dwca.io.ArchiveFactory; import org.gbif.dwca.io.ArchiveFile; +import org.gbif.dwca.io.MetadataException; +import org.gbif.dwca.record.Record; +import org.gbif.dwca.record.StarRecord; import java.io.File; import java.io.IOException; @@ -227,6 +243,12 @@ public void loadVernacularDwCA(Taxonomy taxonomy) throws IndexBuilderException { * @throws IndexBuilderException if unable to load a record into the taxonomy. 
*/ protected void loadTaxonDwCA(Taxonomy taxonomy) throws IndexBuilderException { + String defaultDatasetName = null; + try { + defaultDatasetName = archive.getMetadata().getTitle(); + } catch (MetadataException e) { + taxonomy.report(IssueType.PROBLEM, "provider.archive.noMetadata", (String) null, null); + } if (archive.getCore().getRowType() != DwcTerm.Taxon) throw new IndexBuilderException("Expecting a core row type of " + DwcTerm.Taxon); List classifiers = TaxonConceptInstance.CLASSIFICATION_FIELDS.stream().filter(t -> archive.getCore().hasTerm(t)).collect(Collectors.toList()); @@ -240,7 +262,10 @@ protected void loadTaxonDwCA(Taxonomy taxonomy) throws IndexBuilderException { Record core = record.core(); taxonID = core.value(DwcTerm.taxonID); String verbatimNomenclaturalCode = core.value(DwcTerm.nomenclaturalCode); - NameProvider provider = taxonomy.resolveProvider(core.value(DwcTerm.datasetID), core.value(DwcTerm.datasetName)); + String datasetName = core.value(DwcTerm.datasetName); + if (datasetName == null) + datasetName = defaultDatasetName; + NameProvider provider = taxonomy.resolveProvider(core.value(DwcTerm.datasetID), datasetName); NomenclaturalCode code = taxonomy.resolveCode(verbatimNomenclaturalCode); String scientificName = core.value(DwcTerm.scientificName); String scientificNameAuthorship = core.value(DwcTerm.scientificNameAuthorship); diff --git a/src/main/java/au/org/ala/names/index/IndexBuilderException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java similarity index 57% rename from src/main/java/au/org/ala/names/index/IndexBuilderException.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java index 62dfd56bc..478aea1e7 100644 --- a/src/main/java/au/org/ala/names/index/IndexBuilderException.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IndexBuilderException.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of 
Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; /** diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java new file mode 100644 index 000000000..a32294293 --- /dev/null +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/IssueType.java @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + +package au.org.ala.names.index; + +/** + * Vocabulary for reporting issues. 
+ * + * @author Doug Palmer <Doug.Palmer@csiro.au> + * @copyright Copyright © 2017 Atlas of Living Australia + */ +public enum IssueType { + /** An invalid source taxonomy */ + VALIDATION, + /** An error likely to make a taxonomy unusable */ + ERROR, + /** A problem loading the taxonomy that needs to be addressed */ + PROBLEM, + /** A collision between concepts */ + COLLISION, + /** A note about processing */ + NOTE, + /** A statistic of some sort */ + COUNT +} diff --git a/src/main/java/au/org/ala/names/index/Name.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java similarity index 94% rename from src/main/java/au/org/ala/names/index/Name.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java index 96d61b7a0..6f5a6873c 100644 --- a/src/main/java/au/org/ala/names/index/Name.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Name.java @@ -1,8 +1,23 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; -import org.gbif.api.vocabulary.NomenclaturalCode; import java.util.*; import java.util.stream.Collectors; diff --git a/src/main/java/au/org/ala/names/index/NameAnalyser.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java similarity index 93% rename from src/main/java/au/org/ala/names/index/NameAnalyser.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java index c7ad9c649..71a94218e 100644 --- a/src/main/java/au/org/ala/names/index/NameAnalyser.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameAnalyser.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/src/main/java/au/org/ala/names/index/NameKey.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java similarity index 93% rename from src/main/java/au/org/ala/names/index/NameKey.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java index 9542c9f02..0454df956 100644 --- a/src/main/java/au/org/ala/names/index/NameKey.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameKey.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; /** diff --git a/src/main/java/au/org/ala/names/index/NameProvider.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java similarity index 97% rename from src/main/java/au/org/ala/names/index/NameProvider.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java index 00ef182e6..8d1d8933b 100644 --- a/src/main/java/au/org/ala/names/index/NameProvider.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameProvider.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.index.provider.*; diff --git a/src/main/java/au/org/ala/names/index/NameSource.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java similarity index 92% rename from src/main/java/au/org/ala/names/index/NameSource.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java index 4b75daad0..9d15082d9 100644 --- a/src/main/java/au/org/ala/names/index/NameSource.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/NameSource.java @@ -1,6 +1,23 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.vocab.ALATerm; +import com.opencsv.exceptions.CsvValidationException; import org.apache.commons.collections.MapUtils; import org.gbif.api.model.registry.Citation; import org.gbif.api.model.registry.Contact; @@ -345,20 +362,21 @@ abstract public class NameSource { * * @throws IndexBuilderException if unable to create the name source */ - public static NameSource create(String f) throws IndexBuilderException { + public static NameSource create(File f) throws IndexBuilderException { try { - File nf = new File(f); NameSource ns; - if (!nf.exists()) - throw new IndexBuilderException("Name source " + nf + " does not exist"); - if (nf.isDirectory()) - ns = new DwcaNameSource(nf); + if (!f.exists()) + throw new IndexBuilderException("Name source " + f + " does not exist"); + if (f.isDirectory()) + ns = new DwcaNameSource(f); else - ns = new CSVNameSource(nf.toPath(), "UTF-8", DwcTerm.Taxon); + ns = new CSVNameSource(f.toPath(), "UTF-8", DwcTerm.Taxon); ns.validate(); return ns; } catch (IOException ex) { throw new UncheckedIOException(ex); + } catch (CsvValidationException ex) { + throw new IllegalStateException(ex); } } } diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java new file mode 100644 index 000000000..2849b5948 --- /dev/null +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/RankComparator.java @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + +package au.org.ala.names.index; + +import au.org.ala.names.model.RankType; +import org.gbif.checklistbank.model.Equality; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; + +import static au.org.ala.names.model.RankType.*; + +/** + * Compare two ranks and establish whether they are close enough to each other or different. + */ +public class RankComparator { + private static final RankType[][] RANK_RANGES = new RankType[][] { + new RankType[] { DOMAIN, DOMAIN, KINGDOM }, + new RankType[] { KINGDOM, DOMAIN, INFRAKINGDOM }, + new RankType[] { SUBKINGDOM, KINGDOM, SUPERPHYLUM }, + new RankType[] { INFRAKINGDOM, KINGDOM, SUPERPHYLUM }, + new RankType[] { SUPERPHYLUM, INFRAKINGDOM, INFRAPHYLUM }, + new RankType[] { PHYLUM, INFRAKINGDOM, SUPERCLASS }, + new RankType[] { SUBPHYLUM, INFRAKINGDOM, CLASS }, + new RankType[] { INFRAPHYLUM, PHYLUM, CLASS }, + new RankType[] { SUPERCLASS, INFRAPHYLUM, INFRACLASS }, + new RankType[] { CLASS, SUBPHYLUM, SUPERORDER }, + new RankType[] { SUBCLASS, INFRAPHYLUM, ORDER }, + new RankType[] { INFRACLASS, CLASS, ORDER }, + new RankType[] { SUBINFRACLASS, SUBCLASS, ORDER }, + new RankType[] { SUPERDIVISION_ZOOLOGY, SUBCLASS, ORDER }, + new RankType[] { DIVISION_ZOOLOGY, SUBCLASS, ORDER }, + new RankType[] { SUBDIVISION_ZOOLOGY, SUBCLASS, ORDER }, + new RankType[] { SUPERCOHORT, SUBCLASS, ORDER }, + new RankType[] { COHORT, SUBCLASS, ORDER }, + new RankType[] { SUBCOHORT, SUBCLASS, ORDER }, + new RankType[] { SUPERORDER, INFRACLASS, INFRAORDER }, + new RankType[] { ORDER, SUBCLASS, SUPERFAMILY }, + new RankType[] { SUBORDER, INFRACLASS, FAMILY }, + new RankType[] { INFRAORDER, 
ORDER, FAMILY }, + new RankType[] { PARVORDER, SUBORDER, FAMILY }, + new RankType[] { SUPERSERIES_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SERIES_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SUBSERIES_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SUPERSECTION_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SECTION_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { SUBSECTION_ZOOLOGY, SUBORDER, FAMILY }, + new RankType[] { FAMILY, SUBORDER, INFRAFAMILY }, + new RankType[] { SUBFAMILY, INFRAORDER, GENUS }, + new RankType[] { INFRAFAMILY, FAMILY, GENUS }, + new RankType[] { SUPERTRIBE, SUBFAMILY, GENUS }, + new RankType[] { TRIBE, SUBFAMILY, GENUS }, + new RankType[] { SUBTRIBE, SUBFAMILY, GENUS }, + new RankType[] { SUPERGENUS, INFRAFAMILY, INFRAGENUS }, + new RankType[] { GENUS_GROUP, INFRAFAMILY, INFRAGENUS }, + new RankType[] { GENUS, INFRAFAMILY, SUPERSPECIES }, + new RankType[] { SUBGENUS, INFRAFAMILY, SUPERSPECIES }, + new RankType[] { INFRAGENUS, INFRAFAMILY, SUPERSPECIES }, + new RankType[] { SUPERSECTION_BOTANY, GENUS, SPECIES_SUBGROUP }, + new RankType[] { SECTION_BOTANY, GENUS, SPECIES_SUBGROUP }, + new RankType[] { SUBSECTION_BOTANY, GENUS, SPECIES_SUBGROUP }, + new RankType[] { SUPERSERIES_BOTANY, GENUS, SPECIES_SUBGROUP }, + new RankType[] { SERIES_BOTANY, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SUBSERIES_BOTANY, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { INFRAGENERICNAME, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SPECIES_GROUP, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SUPERSPECIES, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SPECIES_SUBGROUP, SUBGENUS, SPECIES_SUBGROUP }, + new RankType[] { SPECIES, SPECIES, TELEOMORPH }, + new RankType[] { NOTHOSPECIES, SPECIES, SUBSPECIES }, + new RankType[] { HOLOMORPH, SPECIES, SUBSPECIES }, + new RankType[] { ANAMORPH, SPECIES, SUBSPECIES }, + new RankType[] { TELEOMORPH, SPECIES, SUBSPECIES }, + new RankType[] { SUBSPECIES, NOTHOSPECIES, SUPRAGENERICNAME }, + new 
RankType[] { NOTHOSUBSPECIES, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { INFRASPECIFICNAME, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { INFRASUBSPECIESNAME, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { VARIETY, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { NOTHOVARIETY, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { SUBVARIETY, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { FORM, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { NOTHOFORM, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { SUBFORM, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { BIOVAR, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { SEROVAR, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { FORMASPECIALIS, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { CULTIVARGROUP, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { CULTIVAR, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { PATHOVAR, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { HYBRID, NOTHOSPECIES, SUPRAGENERICNAME }, + new RankType[] { SUPRAGENERICNAME, NOTHOSPECIES, SUPRAGENERICNAME } + }; + + private static final Map UPPER_BOUND = Arrays.stream(RANK_RANGES).collect(Collectors.toMap( + r -> r[0], + r -> r[1] + )); + + private static final Map LOWER_BOUND = Arrays.stream(RANK_RANGES).collect(Collectors.toMap( + r -> r[0], + r -> r[2] + )); + + /** + * Compare two ranks and see if they are equivalent-ish. + *

+ * Rank comparators allow a degree of slop between ranks, so that a + * subclass and order or supergenus and family are considered close enough. + * Two ranks with non-positive ids are equal to each other, and {@link RankType#UNRANKED} + * or {@link RankType#INFORMAL} is treated as equal to any other rank. + *

+ * @param rank1 The first rank + * @param rank2 The second rank + * + * @return An equality statement + */ + public Equality compare(RankType rank1, RankType rank2) { + if (rank1 == rank2) + return Equality.EQUAL; + if (rank1.getId() <= 0 && rank2.getId() <= 0) + return Equality.EQUAL; + if (rank1 == UNRANKED || rank2 == UNRANKED || rank1 == INFORMAL || rank2 == INFORMAL) + return Equality.EQUAL; + if (rank1.getId() <= 0 && rank2.getId() <= 0) + return Equality.UNKNOWN; + RankType r1u = UPPER_BOUND.get(rank1); + RankType r1l = LOWER_BOUND.get(rank1); + if ((r1u != null && rank2.compareTo(r1u) >= 0) && (r1l != null && rank2.compareTo(r1l) <= 0)) + return Equality.EQUAL; + RankType r2u = UPPER_BOUND.get(rank2); + RankType r2l = LOWER_BOUND.get(rank2); + if ((r2u != null && rank1.compareTo(r2u) >= 0) && (r2l != null && rank1.compareTo(r2l) <= 0)) + return Equality.EQUAL; + return Equality.DIFFERENT; + } + +} diff --git a/src/main/java/au/org/ala/names/index/Reporter.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java similarity index 79% rename from src/main/java/au/org/ala/names/index/Reporter.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java index 0e97c042e..960fb80f7 100644 --- a/src/main/java/au/org/ala/names/index/Reporter.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Reporter.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import java.util.List; diff --git a/src/main/java/au/org/ala/names/index/ResolutionException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java similarity index 62% rename from src/main/java/au/org/ala/names/index/ResolutionException.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java index d73e47940..ab4ecf920 100644 --- a/src/main/java/au/org/ala/names/index/ResolutionException.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ResolutionException.java @@ -1,7 +1,22 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import javax.annotation.Nullable; -import java.util.Collections; import java.util.List; /** diff --git a/src/main/java/au/org/ala/names/index/ScientificName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java similarity index 85% rename from src/main/java/au/org/ala/names/index/ScientificName.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java index 74d8ce9a6..7e0d0d838 100644 --- a/src/main/java/au/org/ala/names/index/ScientificName.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/ScientificName.java @@ -1,10 +1,31 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; import au.org.ala.names.util.DwcaWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; import java.util.stream.Collectors; /** @@ -30,6 +51,8 @@ * @copyright Copyright (c) 2017 CSIRO */ public class ScientificName extends Name implements Comparable { + private static final Logger logger = LoggerFactory.getLogger(ScientificName.class); + /** * Construct for a container and a key * @@ -100,13 +123,18 @@ public TaxonomicElement findElement(Taxonomy taxonomy, NameProvider provider) { */ @Override protected TaxonConcept findPrincipal(Taxonomy taxonomy) { - TaxonConcept principal = this.findBasePrincipal(taxonomy); - TaxonConceptInstance representative = principal.getRepresentative(); - TaxonConceptInstance resolved = representative.getResolvedAccepted(); + try { + TaxonConcept principal = this.findBasePrincipal(taxonomy); + TaxonConceptInstance representative = principal.getRepresentative(); + TaxonConceptInstance resolved = representative.getResolvedAccepted(); - if (resolved != representative && resolved.getContainer().getContainer() == this) - principal = resolved.getContainer(); - return principal; + if (resolved != representative && resolved.getContainer().getContainer() == this) + principal = resolved.getContainer(); + return 
principal; + } catch (RuntimeException ex) { + logger.error("Unable to find principal for " + this); + throw ex; + } } /** @@ -138,16 +166,16 @@ private TaxonConcept findBasePrincipal(Taxonomy taxonomy) { return concepts.get(0); if (accepted.size() == 1) return accepted.get(0); - List authored = accepted.stream().filter(tc -> tc.isAuthored() || tc.isAutonym()).collect(Collectors.toList()); - if (authored.size() == 0) - return accepted.get(0); + final int score = accepted.stream().mapToInt(TaxonConcept::getPrincipalScore).max().orElse(TaxonomicElement.MIN_SCORE); + List candidates = accepted.stream().filter(tc -> tc.getPrincipalScore() == score).collect(Collectors.toList()); + if (candidates.size() == 1) + return candidates.get(0); + candidates.sort(REVERSE_PRINCIPAL_SCORE_COMPARATOR); + List authored = candidates.stream().filter(tc -> tc.isAuthored() || tc.isAutonym()).collect(Collectors.toList()); if (authored.size() == 1) return authored.get(0); - taxonomy.report(IssueType.COLLISION, "scientificName.collision", this, authored); - final int score = authored.stream().mapToInt(TaxonConcept::getPrincipalScore).max().orElse(TaxonomicElement.MIN_SCORE); - List candidates = authored.stream().filter(tc -> tc.getPrincipalScore() == score).collect(Collectors.toList()); - if (candidates.size() > 1) - taxonomy.report(IssueType.PROBLEM, "scientificName.collision.warn", this, candidates); + taxonomy.report(IssueType.COLLISION, "scientificName.collision", this, candidates); + taxonomy.report(IssueType.PROBLEM, "scientificName.collision.warn", this, candidates); return candidates.get(0); } diff --git a/src/main/java/au/org/ala/names/index/TaxonConcept.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java similarity index 96% rename from src/main/java/au/org/ala/names/index/TaxonConcept.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java index 832b3eeb2..68acc265f 100644 --- 
a/src/main/java/au/org/ala/names/index/TaxonConcept.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConcept.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; @@ -118,7 +134,7 @@ public TaxonConceptInstance addInstance(NameKey instanceKey, TaxonConceptInstanc */ public TaxonConceptInstance findInstance(NameProvider provider, boolean acceptedOnly) { for (TaxonConceptInstance instance: this.instances) - if (instance.getProvider().equals(provider) && (!acceptedOnly || instance.isAccepted())) + if (instance.getProvider().equals(provider) && !instance.isForbidden() && (!acceptedOnly || instance.isAccepted())) return instance; return null; } diff --git a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java similarity index 97% rename from src/main/java/au/org/ala/names/index/TaxonConceptInstance.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java index 0bb5b539b..5506f81bf 100644 --- a/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonConceptInstance.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; @@ -12,6 +28,8 @@ import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.IOException; @@ -29,6 +47,8 @@ * @copyright Copyright © 2017 Atlas of Living Australia */ public class TaxonConceptInstance extends TaxonomicElement { + private static final Logger logger = LoggerFactory.getLogger(TaxonConceptInstance.class); + /** Compare instance base (priovider only) scores */ public static Comparator PROVIDER_SCORE_COMPARATOR = new Comparator() { @Override @@ -272,7 +292,7 @@ public NameProvider getProvider() { } /** - * Get the originating authorityfor the data + * Get the originating authority for the data * * @return The authority source */ @@ -935,6 +955,10 @@ private TaxonConceptInstance getResolvedAccepted(TaxonConceptInstance original, if (trace != null) trace.add(ae); TaxonConceptInstance accepted = ae.getRepresentative(); + if (accepted == null) { + logger.warn("Null representative instance for " + ae + " when resolving " + this); + return resolved; + } accepted = accepted.getResolvedAccepted(original, steps - 1, trace, exception); if (!accepted.isForbidden()) return accepted; @@ -993,27 +1017,33 @@ public void normalise() throws IndexBuilderException { * * @param taxonomy The current taxonomy * + * @return True if successfully resolved + * * 
@throws IndexBuilderException If unable to make a link, usually due to a broken reference */ // If you plan to change this, it is called by a parallel stream, so consisder thread safety // At the moment, this fills out inferred information only - public void resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { + public boolean resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { if (this.parentNameUsageID != null) { this.parent = taxonomy.getInstance(this.parentNameUsageID); } if (this.parentNameUsage != null && this.parent == null) { this.parent = taxonomy.findElement(this.code, this.parentNameUsage, this.provider, null); } - if (this.parent == null && (this.parentNameUsage != null || this.parentNameUsageID != null)) - throw new IndexBuilderException("Unable to find parent taxon for " + this + " from " + this.parentNameUsageID + " - " + this.parentNameUsage); + if (this.parent == null && (this.parentNameUsage != null || this.parentNameUsageID != null)) { + taxonomy.report(IssueType.ERROR, "instance.parent.invalidLink", this.taxonID, this.scientificName, "Unable to find parent taxon for " + this + " from " + this.parentNameUsageID + " - " + this.parentNameUsage); + return false; + } if (this.acceptedNameUsageID != null) { this.accepted = taxonomy.getInstance(this.acceptedNameUsageID); } if (this.acceptedNameUsage != null && this.accepted == null) { this.accepted = taxonomy.findElement(this.code, this.acceptedNameUsage, this.provider, null); } - if (this.accepted == null && (this.acceptedNameUsage != null || this.acceptedNameUsageID != null)) - throw new IndexBuilderException("Unable to find accepted taxon for " + this + " from " + this.acceptedNameUsageID + " - " + this.acceptedNameUsage); + if (this.accepted == null && (this.acceptedNameUsage != null || this.acceptedNameUsageID != null)) { + taxonomy.report(IssueType.ERROR, "instance.accepted.invalidLink", this.taxonID, this.scientificName, "Unable to find accepted taxon for " + this + " from " 
+ this.acceptedNameUsageID + " - " + this.acceptedNameUsage); + return false; + } // No parent or accepted taxon but has a classification, so see if we can deduce a parent if (this.parent == null && this.accepted == null && this.classification != null) { String genus = ""; @@ -1045,6 +1075,7 @@ public void resolveLinks(Taxonomy taxonomy) throws IndexBuilderException { if (this.parent == null) this.parent = this.provider.findDefaultParent(taxonomy, this); taxonomy.count("count.resolve.instance.links"); + return true; } /** diff --git a/src/main/java/au/org/ala/names/index/TaxonResolution.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java similarity index 92% rename from src/main/java/au/org/ala/names/index/TaxonResolution.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java index e51ddea3a..45888e35a 100644 --- a/src/main/java/au/org/ala/names/index/TaxonResolution.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolution.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/src/main/java/au/org/ala/names/index/TaxonResolutionException.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java similarity index 68% rename from src/main/java/au/org/ala/names/index/TaxonResolutionException.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java index d14c8459f..75670dddf 100644 --- a/src/main/java/au/org/ala/names/index/TaxonResolutionException.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolutionException.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; /** diff --git a/src/main/java/au/org/ala/names/index/TaxonResolver.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java similarity index 83% rename from src/main/java/au/org/ala/names/index/TaxonResolver.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java index 3c2cffc12..b6c67c005 100644 --- a/src/main/java/au/org/ala/names/index/TaxonResolver.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonResolver.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/src/main/java/au/org/ala/names/index/TaxonomicElement.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java similarity index 92% rename from src/main/java/au/org/ala/names/index/TaxonomicElement.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java index 339193252..9b6014818 100644 --- a/src/main/java/au/org/ala/names/index/TaxonomicElement.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomicElement.java @@ -1,7 +1,22 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; -import org.gbif.api.vocabulary.NomenclaturalCode; import java.util.Comparator; diff --git a/src/main/java/au/org/ala/names/index/Taxonomy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java similarity index 98% rename from src/main/java/au/org/ala/names/index/Taxonomy.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java index 471bbd6be..84d238157 100644 --- a/src/main/java/au/org/ala/names/index/Taxonomy.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/Taxonomy.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.LinnaeanRankClassification; @@ -478,7 +494,8 @@ public void provideUnknownTaxon() throws Exception { */ public void resolveLinks() throws IndexBuilderException { logger.info("Resolving links"); - this.instances.values().parallelStream().forEach(instance -> instance.resolveLinks(this)); + if (!this.instances.values().parallelStream().allMatch(instance -> instance.resolveLinks(this))) + throw new IndexBuilderException("Errors resolving links"); logger.info("Finished resolving links"); } @@ -1469,6 +1486,7 @@ public void createWorkingIndex() throws IOException { indexer.commitLoadingIndexes(); indexer.generateIndex(); indexer.create(interim); + indexer.createIrmng(null); indexer.commit(); } catch (Exception ex) { throw new IndexBuilderException("Unable to build working index"); @@ -1659,7 +1677,7 @@ public List> getIndexValues(Term type, String taxonID) throws I IndexSearcher searcher = this.searcherManager.acquire(); try { TopDocs docs = searcher.search(query, 100, Sort.INDEXORDER); - List> valueList = new ArrayList<>(docs.totalHits); + List> valueList = new ArrayList<>((int) docs.totalHits.value); for (ScoreDoc sd : docs.scoreDocs) { Document document = searcher.doc(sd.doc); Map values = new HashMap<>(); diff --git a/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java similarity index 63% rename from src/main/java/au/org/ala/names/index/TaxonomyBuilder.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java index 0cf6380bc..0a6e5335e 100644 --- a/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyBuilder.java @@ -1,15 +1,30 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; -import au.org.ala.names.search.ALANameSearcher; import au.org.ala.names.search.DwcaNameIndexer; -import au.org.ala.names.util.FileUtils; import org.apache.commons.cli.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.File; +import java.io.FileInputStream; +import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.List; import java.util.stream.Collectors; @@ -22,6 +37,48 @@ public class TaxonomyBuilder { private static Logger logger = LoggerFactory.getLogger(TaxonomyBuilder.class); + /** + * Recursively find sources. + *

+ * The directory and sub-directory are first searched for a meta.xml file and, + * if present, the source is added as a DwCA. + * Otherwise, any csv files are added to the list and subdirectories recursively + * searched. + *

+ * @param path The file or directory to search for sources + * @return The sources found; empty if the path is missing, is neither a file nor a directory, or cannot be read + */ + protected static List findSources(File path) { + List sources = new ArrayList<>(); + try { + if (!path.exists()) { + logger.info("Path does not exist " + path); + return sources; + } + if (path.isFile()) { + logger.info("Adding source file at " + path); + sources.add(NameSource.create(path)); + return sources; + } + if (!path.isDirectory()) { + logger.info("Unknown file type for " + path); + return sources; + } + if (new File(path, "meta.xml").exists()) { + logger.info("Adding DwCA at " + path); + sources.add(NameSource.create(path)); + return sources; + } + File[] children = path.listFiles(); + if (children == null) throw new IllegalStateException("Unable to list directory " + path); + for (File f : children) + if (f.isDirectory() || f.getName().endsWith(".csv")) + sources.addAll(findSources(f)); + } catch (Exception ex) { + logger.error("Unable to get sources for " + path, ex); + } + return sources; + } public static void main(String[] args) { try { @@ -36,12 +93,14 @@ public static void main(String[] args) { Integer samples = null; DwcaNameIndexer indexer; TaxonomyConfiguration config = null; + List sources; Option o = OptionBuilder.withLongOpt("output").withDescription("Output directory - defaults to 'combined' in the current directory").hasArg().withArgName("DIR").withType(File.class).create('o'); Option w = OptionBuilder.withLongOpt("work").withDescription("Working directory - defaults to the current directory").hasArg().withArgName("DIR").withType(File.class).create('w'); Option c = OptionBuilder.withLongOpt("config").withDescription("Configuration file").hasArg().withArgName("FILE").withType(File.class).create('c'); Option r = OptionBuilder.withLongOpt("report").withDescription("Report file").hasArg().withArgName("FILE").withType(File.class).create('r'); Option p = OptionBuilder.withLongOpt("previous").withDescription("Previous taxonomy DwCA").hasArg().withArgName("DIR").withType(File.class).create('p'); + Option recurse = OptionBuilder.withLongOpt("recurse").withDescription("Input file is a directory, recurse through
subdirectories").create('R'); Option ncl = OptionBuilder.withLongOpt("noclean").withDescription("Don't clean up work area").create(); Option nc = OptionBuilder.withLongOpt("nocreate").withDescription("Don't create an output taxonomy").create(); Option s = OptionBuilder.withLongOpt("sample").withDescription("Output a sample taxonomy, consisting of n concepts plus their parents/accepted").hasArg().withArgName("N").withType(Integer.class).create(); @@ -50,6 +109,7 @@ public static void main(String[] args) { options.addOption(c); options.addOption(r); options.addOption(p); + options.addOption(recurse); options.addOption(ncl); options.addOption(nc); options.addOption(s); @@ -80,7 +140,11 @@ public static void main(String[] args) { if (cmd.hasOption("sample")) { samples = Integer.parseInt(cmd.getOptionValue("sample")); } - List sources = Arrays.asList(cmd.getArgs()).stream().map(f -> NameSource.create(f)).collect(Collectors.toList()); + if (cmd.hasOption("recurse")) { + sources = Arrays.asList(cmd.getArgs()).stream().map(File::new).map(f -> findSources(f)).flatMap(List::stream).collect(Collectors.toList()); + } else { + sources = Arrays.asList(cmd.getArgs()).stream().map(File::new).map(f -> NameSource.create(f)).collect(Collectors.toList()); + } Taxonomy taxonomy = new Taxonomy(config, work); taxonomy.begin(); taxonomy.load(sources); diff --git a/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java similarity index 92% rename from src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java index 9a1451a10..751119928 100644 --- a/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/TaxonomyConfiguration.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All 
Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.util.GbifModule; @@ -18,7 +34,6 @@ import java.io.*; import java.net.URI; import java.util.*; -import java.util.stream.Collectors; /** * A readable description of a taxonomy construction. diff --git a/src/main/java/au/org/ala/names/index/UnrankedScientificName.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java similarity index 93% rename from src/main/java/au/org/ala/names/index/UnrankedScientificName.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java index 17f5c477d..2f3b194ab 100644 --- a/src/main/java/au/org/ala/names/index/UnrankedScientificName.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/UnrankedScientificName.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java similarity index 75% rename from src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java index cf39481e3..e12ef4e2c 100644 --- a/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/AndTaxonCondition.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java similarity index 51% rename from src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java index c1baa7787..ed1eda6f6 100644 --- a/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ConceptResolutionPriority.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; /** diff --git a/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java similarity index 55% rename from src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java index eb418b88e..217882cc7 100644 --- a/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/DiscardStrategy.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; /** diff --git a/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java similarity index 61% rename from src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java index 813cc5723..d30ca3392 100644 --- a/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjuster.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java similarity index 86% rename from src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java index 362fb9bb9..f51a8f904 100644 --- a/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/KeyAdjustment.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java similarity index 94% rename from src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java index c9992a01d..ce51a3b73 100644 --- a/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/MatchTaxonCondition.java @@ -1,12 +1,26 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; -import au.org.ala.names.index.NameProvider; import au.org.ala.names.index.TaxonConceptInstance; import au.org.ala.names.model.RankType; import au.org.ala.names.model.TaxonomicType; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonProperty; import org.gbif.api.vocabulary.NameType; import org.gbif.api.vocabulary.NomenclaturalCode; import org.gbif.api.vocabulary.NomenclaturalStatus; @@ -211,7 +225,7 @@ private boolean matchScientificName(String name) { return this.matchScientificName.equals(name); case REGEX: if (this.patternScientificName == null) - this.patternScientificName = Pattern.compile(this.scientificName); + this.patternScientificName = Pattern.compile(this.scientificName, Pattern.CASE_INSENSITIVE); return this.patternScientificName.matcher(name).matches(); default: if (this.matchScientificName == null) diff --git a/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java new file mode 100644 index 000000000..0f2a554e0 --- /dev/null +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/NameMatchType.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + +package au.org.ala.names.index.provider; + +/** + * How to match a name or author + * + * @author Doug Palmer <Doug.Palmer@csiro.au> + * @copyright Copyright © 2017 Atlas of Living Australia + */ +public enum NameMatchType { + /** Exact match */ + EXACT, + /** Case and space insensitive */ + INSENSITIVE, + /** Normalised by GBIF name analysis @see org.gbif.checklistbank.utils.SciNameNormalizer @see org.gbif.checklistbank.authorship.AuthorComparator */ + NORMALISED, + /** Regular expression match */ + REGEX +} diff --git a/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java similarity index 75% rename from src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java index e869c11a7..b22618e03 100644 --- a/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/OrTaxonCondition.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java similarity index 73% rename from src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java index e1cd00e8a..65d139750 100644 --- a/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjuster.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; @@ -6,7 +22,6 @@ import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; /** * A score adjustment for applying to a specific diff --git a/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java similarity index 61% rename from src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java index 1bfdde839..e949e4cae 100644 --- a/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/ScoreAdjustment.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java similarity index 67% rename from src/main/java/au/org/ala/names/index/provider/TaxonCondition.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java index 7e7c21a8e..63c6ea65f 100644 --- a/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/TaxonCondition.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.NameKey; diff --git a/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java similarity index 86% rename from src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java index 1f997c0cb..00454e4c0 100644 --- a/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/index/provider/UnrankedStrategy.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.TaxonConceptInstance; @@ -5,7 +21,6 @@ import au.org.ala.names.model.TaxonomicType; import java.util.Arrays; -import java.util.Collection; import java.util.HashSet; import java.util.Set; diff --git a/src/main/java/au/org/ala/names/search/ALANameIndexer.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/ALANameIndexer.java similarity index 81% rename from src/main/java/au/org/ala/names/search/ALANameIndexer.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/search/ALANameIndexer.java index f86800453..1b22633e6 100644 --- a/src/main/java/au/org/ala/names/search/ALANameIndexer.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/ALANameIndexer.java @@ -24,13 +24,12 @@ import com.opencsv.CSVReaderBuilder; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; -import org.apache.lucene.document.*; -import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -117,35 +116,6 @@ public class ALANameIndexer { private String indexDirectory; private IndexWriter cbIndexWriter; - //Fields that are being indexed or stored in the lucene index - public enum IndexField { - - NAME("name"), - NAMES("names"), - ID("id"), - RANK("rank"), - SEARCHABLE_NAME("searchcan"), - LSID("lsid"), - HOMONYM("homonym"), - ACCEPTED("synonym"), - LEFT("left"), - RIGHT("right"), - PRIORITY("priority"), - SEARCHABLE_COMMON_NAME("common"), - COMMON_NAME("common_orig"), - LANGUAGE("lang"); - 
- String name; - - IndexField(String name) { - this.name = name; - } - - public String toString() { - return name; - } - } - PhraseNameParser parser = new PhraseNameParser(); Set knownHomonyms = new HashSet(); Set blacklist = new HashSet(); @@ -200,7 +170,7 @@ public void createIndex(String exportsDir, String indexDir, boolean generateSciN * @throws Exception */ public void createIrmngIndex(String exportsDir, String indexDir) throws Exception { - Analyzer analyzer = new LowerCaseKeywordAnalyzer(); + Analyzer analyzer = LowerCaseKeywordAnalyzer.newInstance(); IndexWriter irmngWriter = createIndexWriter(new File(indexDir + File.separator + "irmng"), analyzer, true); indexIrmngDwcA(irmngWriter, irmngDwcaDirectory); indexIRMNG(irmngWriter, exportsDir + File.separator + "ala-species-homonyms.txt", RankType.SPECIES); @@ -211,7 +181,7 @@ public void createIrmngIndex(String exportsDir, String indexDir) throws Exceptio public void createIndex(String exportsDir, String indexDir, String acceptedFile, String synonymFile, String irmngDwca, boolean generateSciNames, boolean generateCommonNames) throws Exception { - Analyzer analyzer = new LowerCaseKeywordAnalyzer(); + Analyzer analyzer = LowerCaseKeywordAnalyzer.newInstance(); //generate the extra id index createExtraIdIndex(indexDir + File.separator + "id", new File(exportsDir + File.separator + "identifiers.txt")); if (generateSciNames) { @@ -244,11 +214,10 @@ private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception { Document doc = new Document(); String id = values[POS_ID]; String guid = values[POS_LSID]; - doc.add(new StringField("id", id, Store.YES)); - if (StringUtils.isEmpty(id)) + NameIndexField.ID.store(id, doc); + if (StringUtils.isEmpty(id)) guid = id; - - doc.add(new StoredField("guid", guid)); + NameIndexField.GUID.store(guid, doc); iw.addDocument(doc); } System.out.println("Finished writing the tmp guid index..."); @@ -348,13 +317,13 @@ private void indexALA(IndexWriter iw, String file, 
String synonymFile) throws Ex values[POS_PID], values[POS_C], values[POS_CID], values[POS_O], values[POS_OID], values[POS_F], values[POS_FID], values[POS_G], values[POS_GID], values[POS_S], values[POS_SID], - values[POS_LFT], values[POS_RGT], acceptedValues, + Integer.parseInt(values[POS_LFT]), Integer.parseInt(values[POS_RGT]), acceptedValues, values[POS_SP_EPITHET], values[POS_INFRA_EPITHET], values[POS_AUTHOR], null, null, priority); //add the excluded information if applicable if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) { - doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(), SynonymType.EXCLUDES.getId().toString(), Store.YES)); + NameIndexField.SYNONYM_TYPE.store(SynonymType.EXCLUDES.getId().toString(), doc); } if (doc != null) { iw.addDocument(doc); @@ -376,7 +345,7 @@ private void indexALA(IndexWriter iw, String file, String synonymFile) throws Ex public void addAdditionalName(String lsid, String scientificName, String author, LinnaeanRankClassification cl) throws Exception { if (cbIndexWriter == null) - cbIndexWriter = createIndexWriter(new File(indexDirectory + File.separator + "cb"), new LowerCaseKeywordAnalyzer(), false); + cbIndexWriter = createIndexWriter(new File(indexDirectory + File.separator + "cb"), LowerCaseKeywordAnalyzer.newInstance(), false); Document doc = createALAIndexDocument(scientificName, "-1", lsid, author, cl); cbIndexWriter.addDocument(doc); @@ -389,7 +358,7 @@ public void addAdditionalName(String lsid, String scientificName, String author, */ public void deleteName(String lsid) throws Exception{ if(cbIndexWriter == null){ - cbIndexWriter = createIndexWriter(new File(indexDirectory+ File.separator + "cb"), new LowerCaseKeywordAnalyzer(), false); + cbIndexWriter = createIndexWriter(new File(indexDirectory+ File.separator + "cb"), LowerCaseKeywordAnalyzer.newInstance(), false); } Term term = new Term("lsid", lsid); cbIndexWriter.deleteDocuments(new TermQuery(term)); @@ -442,44 +411,44 @@ protected 
void indexIrmngDwcA(IndexWriter iw, String archiveDirectory) throws Ex Document doc = new Document(); String kingdom = dwcr.value(DwcTerm.kingdom); if (StringUtils.isNotEmpty(kingdom)) { - doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES)); + NameIndexField.KINGDOM.store(kingdom, doc); } String phylum = dwcr.value(DwcTerm.phylum); if (StringUtils.isNotEmpty(phylum)) { - doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES)); + NameIndexField.PHYLUM.store(phylum, doc); } String classs = dwcr.value(DwcTerm.class_); if (StringUtils.isNotEmpty(classs)) { - doc.add(new TextField(RankType.CLASS.getRank(), classs, Store.YES)); + NameIndexField.CLASS.store(classs, doc); } String order = dwcr.value(DwcTerm.order); if (StringUtils.isNotEmpty(order)) { - doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES)); + NameIndexField.ORDER.store(order, doc); } String family = dwcr.value(DwcTerm.family); if (StringUtils.isNotEmpty(family)) { - doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES)); + NameIndexField.FAMILY.store(family, doc); } String genus = dwcr.value(DwcTerm.genus); String calculatedRank = "genus"; if (StringUtils.isNotEmpty(genus)) { - doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES)); + NameIndexField.GENUS.store(genus, doc); String specificEpithet = dwcr.value(DwcTerm.specificEpithet); if (StringUtils.isNotEmpty(specificEpithet)) { calculatedRank = "species"; - doc.add(new TextField(RankType.SPECIES.getRank(), genus + " " + specificEpithet, Store.YES)); + NameIndexField.SPECIES.store(genus + " " + specificEpithet, doc); } } String rank = dwcr.value(DwcTerm.taxonRank); if (StringUtils.isEmpty(rank)) rank = calculatedRank; - doc.add(new TextField(IndexField.RANK.toString(), rank, Store.YES)); + NameIndexField.RANK.store(rank, doc); //now add the author - we don't do anything about this on homonym resolution yet //Add the author information String author = 
dwcr.value(DwcTerm.scientificNameAuthorship); if (StringUtils.isNotEmpty(author)) { //TODO think about whether we need to treat the author string with the taxamatch - doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES)); + NameIndexField.AUTHOR.store(author, doc); } //now add it to the index iw.addDocument(doc); @@ -504,20 +473,21 @@ void indexIRMNG(IndexWriter iw, String irmngExport, RankType rank) throws Except while ((values = reader.readNext()) != null) { Document doc = new Document(); if (values != null && values.length >= 7) { - doc.add(new TextField(RankType.KINGDOM.getRank(), values[0], Store.YES)); - doc.add(new TextField(RankType.PHYLUM.getRank(), values[1], Store.YES)); - doc.add(new TextField(RankType.CLASS.getRank(), values[2], Store.YES)); - doc.add(new TextField(RankType.ORDER.getRank(), values[3], Store.YES)); - doc.add(new TextField(RankType.FAMILY.getRank(), values[4], Store.YES)); - doc.add(new TextField(RankType.GENUS.getRank(), values[5], Store.YES)); + NameIndexField.KINGDOM.store(values[0], doc); + NameIndexField.PHYLUM.store(values[1], doc); + NameIndexField.CLASS.store(values[2], doc); + NameIndexField.ORDER.store(values[3], doc); + NameIndexField.FAMILY.store(values[4], doc); + NameIndexField.GENUS.store(values[5], doc); if (rank == RankType.GENUS) { - doc.add(new TextField(IndexField.ID.toString(), values[6], Store.YES)); - doc.add(new TextField(IndexField.ACCEPTED.toString(), values[8], Store.YES)); - doc.add(new TextField(IndexField.HOMONYM.toString(), values[10], Store.YES)); + + NameIndexField.ID.store(values[6], doc); + NameIndexField.ACCEPTED.store(values[8], doc); + NameIndexField.HOMONYM.store(values[10], doc); } else if (rank == RankType.SPECIES) { - doc.add(new TextField(RankType.SPECIES.getRank(), values[6], Store.YES)); + NameIndexField.SPECIES.store(values[6], doc); } - doc.add(new TextField(IndexField.RANK.toString(), rank.getRank(), Store.YES)); + NameIndexField.RANK.store(rank.getRank(), doc); 
iw.addDocument(doc); count++; } @@ -568,7 +538,7 @@ private void addCoLCommonNames(IndexWriter iw, IndexSearcher currentSearcher) th while ((values = reader.readNext()) != null) { if (values.length == 3) { if (doesTaxonConceptExist(currentSearcher, values[2])) { - iw.addDocument(createCommonNameDocument(values[0], values[1], values[2], null, 1.0f)); + iw.addDocument(createCommonNameDocument(values[0], values[1], values[2], null)); count++; } else { System.out.println("Unable to locate LSID " + values[2] + " in current dump"); @@ -605,13 +575,13 @@ private void addAnbgCommonNames(String fileName, IndexWriter iw, IndexSearcher c if (doesTaxonConceptExist(currentSearcher, values[3]) || doesTaxonConceptExist(idSearcher, values[3])) { //each common name could be a comma separated list if (!values[2].contains(",") || values[2].toLowerCase().contains(" and ")) { - iw.addDocument(createCommonNameDocument(values[2], null, values[3], null, 2.0f)); + iw.addDocument(createCommonNameDocument(values[2], null, values[3], null)); count++; } else { //we need to process each common name in the list String[] names = p.split(values[2]); for (String name : names) { - iw.addDocument(createCommonNameDocument(name, null, values[3],null, 2.0f)); + iw.addDocument(createCommonNameDocument(name, null, values[3],null)); count++; } } @@ -651,9 +621,9 @@ protected void createExtraIdIndex(IndexWriter iw, File idFile) throws Exception if (values != null && values.length >= 3) { Document doc = new Document(); //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED)); - doc.add(new StringField("lsid", values[2], Store.YES)); + NameIndexField.LSID.store(values[2], doc); //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO)); - doc.add(new StoredField("reallsid", values[1])); + NameIndexField.REAL_LSID.store(values[1], doc); iw.addDocument(doc); } } @@ -700,7 +670,7 @@ private IndexSearcher createTmpIndex(String tcFileName) throws Exception { //just add the LSID to the index 
Document doc = new Document(); - doc.add(new StringField("lsid", values[0], Store.YES)); + NameIndexField.LSID.store(values[0], doc); iw.addDocument(doc); } @@ -723,7 +693,7 @@ private boolean doesTaxonConceptExist(IndexSearcher is, String lsid) { TermQuery query = new TermQuery(new Term("lsid", lsid)); try { org.apache.lucene.search.TopDocs results = is.search(query, 1); - return results.totalHits > 0; + return results.totalHits.value > 0; } catch (IOException e) { return false; } @@ -743,7 +713,7 @@ private String getAcceptedLSID(String value) { try { TermQuery tq = new TermQuery(new Term("lsid", value)); org.apache.lucene.search.TopDocs results = idSearcher.search(tq, 1); - if (results.totalHits > 0) + if (results.totalHits.value > 0) return idSearcher.doc(results.scoreDocs[0].doc).get("reallsid"); } catch (IOException e) { } @@ -751,38 +721,34 @@ private String getAcceptedLSID(String value) { return value; } - protected Document createCommonNameDocument(String cn, String sn, String lsid, String language, float boost){ - return createCommonNameDocument(cn, sn, lsid, language, boost, true); + protected Document createCommonNameDocument(String cn, String sn, String lsid, String language){ + return createCommonNameDocument(cn, sn, lsid, language, true); } - protected Document createCommonNameDocument(String cn, String sn, String lsid, String language, float boost, boolean checkAccepted) { + protected Document createCommonNameDocument(String cn, String sn, String lsid, String language, boolean checkAccepted) { Document doc = new Document(); - //we are only interested in keeping all the alphanumerical values of the common name - //when searching the same operations will need to be peformed on the search string - TextField searchAbleName = new TextField(IndexField.SEARCHABLE_COMMON_NAME.toString(), cn.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""), Store.YES); - searchAbleName.setBoost(boost); - doc.add(searchAbleName); + // Uses field type to normalise + 
NameIndexField.SEARCHABLE_COMMON_NAME.store(cn, doc); if (sn != null) { - doc.add(new TextField(IndexField.NAME.toString(), sn, Store.YES)); + NameIndexField.NAME.store(sn, doc); } String newLsid = getAcceptedLSID(lsid); - - doc.add(new TextField(IndexField.COMMON_NAME.toString(), cn, Store.YES)); - doc.add(new TextField(IndexField.LSID.toString(), newLsid, Store.YES)); + NameIndexField.COMMON_NAME.store(cn, doc); + NameIndexField.LSID.store(newLsid, doc); if(language != null) { - doc.add(new TextField(IndexField.LANGUAGE.toString(), language.toLowerCase().trim(), Store.YES)); - } + NameIndexField.LANGUAGE.store(language.toLowerCase().trim(), doc); + } return doc; } public Document createALAIndexDocument(String name, String id, String lsid, String author, LinnaeanRankClassification cl){ - return createALAIndexDocument(name,id, lsid, author,null,null, null, null, cl, null, null, MatchMetrics.DEFAULT_PRIORITY); + return createALAIndexDocument(name,id, lsid, author,null,null, 0, 0, cl, null, null, MatchMetrics.DEFAULT_PRIORITY); } - public Document createALAIndexDocument(String name, String id, String lsid, String author, String rank, String rankId, String left, String right, LinnaeanRankClassification cl, String nameComplete, Collection otherNames, int priority){ + public Document createALAIndexDocument(String name, String id, String lsid, String author, String rank, String rankId, int left, int right, LinnaeanRankClassification cl, String nameComplete, Collection otherNames, int priority){ if(cl == null) cl = new LinnaeanRankClassification(); return createALAIndexDocument(name, id, lsid, rankId, rank, cl.getKingdom(), cl.getKid(), cl.getPhylum() @@ -793,11 +759,11 @@ public Document createALAIndexDocument(String name, String id, String lsid, Stri protected Document createALASynonymDocument(String scientificName, String author, String nameComplete, Collection otherNames, String id, String lsid, String nameLsid, String acceptedLsid, String acceptedId, int priority, 
String synonymType) { lsid = StringUtils.isBlank(lsid) ? nameLsid : lsid; Document doc = createALAIndexDocument(scientificName, id, lsid, null, null, - null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, null, null, null, null, null, null, null, 0, 0, acceptedLsid, null, null, author, nameComplete, otherNames, priority); if (doc != null && synonymType != null) { try { - doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(), synonymType, Store.YES)); + NameIndexField.SYNONYM_TYPE.store(synonymType, doc); } catch (Exception e) { System.out.println("Error on " + scientificName + " " + author + " " + id + ". " + e.getMessage()); } @@ -812,7 +778,7 @@ private boolean isBlacklisted(String scientificName) { protected Document createALAIndexDocument(String name, String id, String lsid, String rank, String rankString, String kingdom, String kid, String phylum, String pid, String clazz, String cid, String order, String oid, String family, String fid, String genus, String gid, - String species, String sid, String left, String right, String acceptedConcept, String specificEpithet, + String species, String sid, int left, int right, String acceptedConcept, String specificEpithet, String infraspecificEpithet, String author, String nameComplete, Collection otherNames, int priority) { // @@ -821,6 +787,7 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S return null; } + int rankIndex = rank == null || rankString.isEmpty() ? 
-1 : Integer.parseInt(rank); nameComplete = buildNameComplete(name, author, nameComplete); CleanedScientificName cname = new CleanedScientificName(name); CleanedScientificName cnameComplete = new CleanedScientificName(nameComplete); @@ -828,11 +795,10 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S String soundexGenus = genus; //Add the ids - doc.add(new StringField(NameIndexField.ID.toString(), id, Store.YES)); - - doc.add(new StringField(NameIndexField.LSID.toString(), lsid, Store.YES)); + NameIndexField.ID.store(id, doc); + NameIndexField.LSID.store(lsid, doc); if (lsid.startsWith("ALA")) { - doc.add(new StringField(NameIndexField.ALA.toString(), "T", Store.YES)); + NameIndexField.ALA.store("T", doc); } @@ -844,85 +810,83 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S nameSet.add(cnameComplete.getNormalised()); nameSet.add(cnameComplete.getBasic()); for (String n: nameSet) { - Field f = new TextField(NameIndexField.NAME.toString(), n, Store.YES); - doc.add(f); + NameIndexField.NAME.store(n, doc); } - - doc.add(new StringField(NameIndexField.NAME_CANONICAL.toString(), cname.getNormalised(), Store.YES)); - doc.add(new StringField(NameIndexField.NAME_COMPLETE.toString(), cnameComplete.getNormalised(), Store.YES)); + NameIndexField.NAME_CANONICAL.store(cname.getNormalised(), doc); + NameIndexField.NAME_COMPLETE.store(cnameComplete.getNormalised(), doc); //rank information - if (StringUtils.isNotEmpty(rank)) { - doc.add(new StringField(NameIndexField.RANK_ID.toString(), rank, Store.YES)); + if (rankIndex >= 0) { + NameIndexField.RANK_ID.store(rankIndex, doc); } if (StringUtils.isNotEmpty(rankString)) { - doc.add(new StringField(NameIndexField.RANK.toString(), rankString, Store.YES)); + NameIndexField.RANK.store(rankString, doc); } //handle the synonyms if (StringUtils.isNotEmpty(acceptedConcept)) { - doc.add(new StringField(NameIndexField.ACCEPTED.toString(), acceptedConcept, Store.YES)); - 
doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "T", Store.YES)); - } else { - doc.add(new StringField(NameIndexField.iS_SYNONYM.toString(), "F", Store.YES)); + NameIndexField.ACCEPTED.store(acceptedConcept, doc); + NameIndexField.iS_SYNONYM.store("T", doc); + } else { + NameIndexField.iS_SYNONYM.store("F", doc); } //Add the classification information if (StringUtils.trimToNull(kingdom) != null) { - doc.add(new TextField(RankType.KINGDOM.getRank(), kingdom, Store.YES)); - if (StringUtils.isNotBlank(kid)) { - doc.add(new StoredField("kid", kid)); + NameIndexField.KINGDOM.store(kingdom, doc); + if (StringUtils.isNotBlank(kid)) { + NameIndexField.KINGDOM_ID.store(kid, doc); } } if (StringUtils.trimToNull(phylum) != null) { - doc.add(new TextField(RankType.PHYLUM.getRank(), phylum, Store.YES)); + NameIndexField.PHYLUM.store(phylum, doc); if (StringUtils.isNotBlank(pid)) { - doc.add(new StoredField("pid", pid)); + NameIndexField.PHYLUM_ID.store(pid, doc); } } if (StringUtils.trimToNull(clazz) != null) { - doc.add(new TextField(RankType.CLASS.getRank(), clazz, Store.YES)); + NameIndexField.CLASS.store(clazz, doc); if (StringUtils.isNotBlank(cid)) { - doc.add(new StoredField("cid", cid)); + NameIndexField.CLASS_ID.store(cid, doc); } } if (StringUtils.trimToNull(order) != null) { - doc.add(new TextField(RankType.ORDER.getRank(), order, Store.YES)); + NameIndexField.ORDER.store(order, doc); if (StringUtils.isNotBlank(oid)) { - doc.add(new StoredField("oid", oid)); + NameIndexField.ORDER_ID.store(oid, doc); } } if (StringUtils.trimToNull(family) != null) { - doc.add(new TextField(RankType.FAMILY.getRank(), family, Store.YES)); + NameIndexField.FAMILY.store(family, doc); if (StringUtils.isNotBlank(fid)) { - doc.add(new StoredField("fid", fid)); + NameIndexField.FAMILY_ID.store(fid, doc); } } if (StringUtils.trimToNull(genus) != null) { - doc.add(new TextField(RankType.GENUS.getRank(), genus, Store.YES)); + NameIndexField.GENUS.store(genus, doc); if 
(StringUtils.isNotBlank(gid)) { - doc.add(new StoredField("gid", gid)); + NameIndexField.GENUS_ID.store(gid, doc); } } if (StringUtils.trimToNull(species) != null) { - doc.add(new TextField(RankType.SPECIES.getRank(), species, Store.YES)); + NameIndexField.SPECIES.store(species, doc); if (StringUtils.isNotBlank(sid)) { - doc.add(new StoredField("sid", sid)); + NameIndexField.SPECIES_ID.store(sid, doc); } } - if (StringUtils.trimToNull(left) != null) { - doc.add(new StringField("left", left, Store.YES)); + if (left > 0) { + NameIndexField.LEFT.store(left, doc); } - if (StringUtils.trimToNull(right) != null) { - doc.add(new StringField("right", right, Store.YES)); + if (right > 0) { + NameIndexField.RIGHT.store(right, doc); } - doc.add(new StoredField("priority", priority)); + NameIndexField.PRIORITY.store(priority, doc); //Add the author information if (StringUtils.isNotEmpty(author)) { //TODO think about whether we need to treat the author string with the taxamatch - doc.add(new TextField(NameIndexField.AUTHOR.toString(), author, Store.YES)); + NameIndexField.AUTHOR.store(author, doc); } @@ -937,8 +901,7 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S && cn.getType() != NameType.INFORMAL && !"6500".equals(rank) && cn.getType() != NameType.DOUBTFUL) { if (!nameSet.contains(cn.canonicalName())) { - Field f2 = new TextField(NameIndexField.NAME.toString(), cn.canonicalName(), Store.YES); - doc.add(f2); + NameIndexField.NAME.store(cn.canonicalName(), doc); } if (specificEpithet == null && cn.isBinomial()) { //check to see if we need to determine the epithets from the parse @@ -950,30 +913,28 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S //check to see if the concept represents a phrase name if (cn != null && cn instanceof ALAParsedName) { //set up the field type that is stored and Index.ANALYZED_NO_NORMS - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setOmitNorms(true); ALAParsedName 
alapn = (ALAParsedName) cn; if (alapn.getRank() != Rank.SPECIES && alapn.getSpecificEpithet() != null) { - doc.add(new Field(NameIndexField.SPECIFIC.toString(), alapn.getSpecificEpithet(), ft)); + NameIndexField.SPECIFIC.store(alapn.getSpecificEpithet(), doc); } else if (alapn.getRank() != Rank.SPECIES && alapn.getSpecificEpithet() == null) { log.warn(lsid + " " + name + " has an empty specific for non sp. phrase"); } if (StringUtils.trimToNull(alapn.getLocationPhraseDescription()) != null) { - doc.add(new Field(NameIndexField.PHRASE.toString(), alapn.cleanPhrase, ft)); + NameIndexField.PHRASE.store(alapn.cleanPhrase, doc); } if (alapn.getPhraseVoucher() != null) { - doc.add(new Field(NameIndexField.VOUCHER.toString(), alapn.cleanVoucher, ft)); + NameIndexField.VOUCHER.store(alapn.cleanVoucher, doc); } if (StringUtils.isBlank(genus) && StringUtils.isNotBlank(alapn.getGenusOrAbove())) { //add the genus to the index as it is necessary to match on the phrase name. - doc.add(new TextField(RankType.GENUS.getRank(), alapn.getGenusOrAbove(), Store.YES)); + NameIndexField.GENUS.store(alapn.getGenusOrAbove(), doc); } } } catch (org.gbif.api.exception.UnparsableException e) { //check to see if the name is a virus in which case an extra name is added without the virus key word if (e.type == NameType.VIRUS) { - doc.add(new TextField(NameIndexField.NAME.toString(), ALANameSearcher.virusStopPattern.matcher(name).replaceAll(" "), Store.YES)); + NameIndexField.NAME.store(ALANameSearcher.virusStopPattern.matcher(name).replaceAll(" "), doc); } } catch (Exception e) { @@ -984,24 +945,24 @@ protected Document createALAIndexDocument(String name, String id, String lsid, S //add the sound expressions for the name if required try { if (StringUtils.isNotBlank(soundexGenus)) { - doc.add(new TextField(NameIndexField.GENUS_EX.toString(), TaxonNameSoundEx.treatWord(soundexGenus, "genus"), Store.YES)); + NameIndexField.GENUS_EX.store(TaxonNameSoundEx.treatWord(soundexGenus, "genus"), doc); } if 
(StringUtils.isNotBlank(specificEpithet)) { String soundex = TaxonNameSoundEx.treatWord(specificEpithet, "species"); if (soundex == null) soundex = ""; - doc.add(new TextField(NameIndexField.SPECIES_EX.toString(), soundex, Store.YES)); + NameIndexField.SPECIES_EX.store(soundex, doc); } else if (StringUtils.isNotBlank(soundexGenus)) { - doc.add(new TextField(NameIndexField.SPECIES_EX.toString(), "", Store.YES)); + NameIndexField.SPECIES_EX.store("", doc); } if (StringUtils.isNotBlank(infraspecificEpithet)) { String soundex = TaxonNameSoundEx.treatWord(infraspecificEpithet, "species"); if (soundex == null) soundex = ""; - doc.add(new TextField(NameIndexField.INFRA_EX.toString(), soundex, Store.YES)); - } else if (StringUtils.isNotBlank(specificEpithet)) { + NameIndexField.INFRA_EX.store(soundex, doc); + } else if (StringUtils.isNotBlank(specificEpithet)) { //make searching for an empty infraspecific soudex easier - doc.add(new TextField(NameIndexField.INFRA_EX.toString(), "", Store.YES)); + NameIndexField.INFRA_EX.store("", doc); } } catch (Exception e) { log.warn(lsid + " " + name + " has issues creating a soundex: " + e.getMessage()); diff --git a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java similarity index 96% rename from src/main/java/au/org/ala/names/search/DwcaNameIndexer.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java index 883464bdb..974f5fec1 100644 --- a/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/search/DwcaNameIndexer.java @@ -21,9 +21,10 @@ import com.opencsv.CSVReader; import org.apache.commons.cli.*; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.time.DateFormatUtils; +import org.apache.commons.lang3.StringUtils; +import 
org.apache.commons.lang3.time.DateFormatUtils; import org.apache.log4j.Logger; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.document.*; import org.apache.lucene.index.DirectoryReader; @@ -51,6 +52,8 @@ import java.io.IOException; import java.io.InputStream; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * @@ -81,6 +84,9 @@ public class DwcaNameIndexer extends ALANameIndexer { RankType.KINGDOM, RankType.PHYLUM, RankType.CLASS, RankType.ORDER, RankType.FAMILY }; + /** Detect names with an additional locality in parentheses at the end */ + protected static final Pattern LOCALITY_PATTERN = Pattern.compile("^([\\p{Alnum}.'()\\s]+)\\s+\\([\\p{Alnum}\\s]+\\)\\s*$"); + private static int PAGE_SIZE = 25000; private boolean loadingIndex; private boolean sciIndex; @@ -92,15 +98,15 @@ public class DwcaNameIndexer extends ALANameIndexer { private IndexWriter loadingIndexWriter = null; private IndexWriter vernacularIndexWriter = null; private IndexWriter idWriter = null; - private LowerCaseKeywordAnalyzer analyzer; + private Analyzer analyzer; private Map priorities; - public DwcaNameIndexer(File targetDir, File tmpDir, Properties priorities, boolean loadingIndex, boolean sciIndex) { + public DwcaNameIndexer(File targetDir, File tmpDir, Properties priorities, boolean loadingIndex, boolean sciIndex) throws IOException { this.targetDir = targetDir; this.tmpDir = tmpDir; this.loadingIndex = loadingIndex; this.sciIndex = sciIndex; - this.analyzer = new LowerCaseKeywordAnalyzer(); + this.analyzer = LowerCaseKeywordAnalyzer.newInstance(); this.priorities = this.buildPriorities(priorities); } @@ -212,10 +218,9 @@ public boolean create(File namesDwc) throws Exception{ } public void createIrmng(File irmngDwc) throws Exception { - if (irmngDwc == null || !irmngDwc.exists()) - return; - IndexWriter irmngWriter = this.createIndexWriter(new File(this.targetDir, 
"irmng"), this.analyzer, true); - this.indexIrmngDwcA(irmngWriter, irmngDwc.getCanonicalPath()); + IndexWriter irmngWriter = this.createIndexWriter(new File(this.targetDir, "irmng"), this.analyzer, true); + if (irmngDwc != null && irmngDwc.exists()) + this.indexIrmngDwcA(irmngWriter, irmngDwc.getCanonicalPath()); irmngWriter.commit(); irmngWriter.forceMerge(1); irmngWriter.close(); @@ -311,7 +316,7 @@ private boolean loadCommonNames(File verncacularDwc) throws Exception { lsid = result.getAcceptedLsid() != null ? result.getAcceptedLsid() : result.getLsid(); if (scientificName == null) scientificName = result.getRankClassification().getScientificName(); - Document doc = this.createCommonNameDocument(vernacularName, scientificName, lsid, language,1.0f, false); + Document doc = this.createCommonNameDocument(vernacularName, scientificName, lsid, language, false); this.vernacularIndexWriter.addDocument(doc); } return true; @@ -344,9 +349,9 @@ private void indexCommonNames(File file) throws Exception { String lsid = StringUtils.isNotEmpty(values[1]) ? 
values[1] : values[0]; //check to see if it exists TopDocs result = getLoadIdxResults(null, "lsid", lsid, 1); - if(result.totalHits>0){ + if(result.totalHits.value > 0){ //we can add the common name - Document doc = createCommonNameDocument(values[3], values[2], lsid, values[4], 1.0f, false); + Document doc = createCommonNameDocument(values[3], values[2], lsid, values[4], false); this.vernacularIndexWriter.addDocument(doc); count++; } @@ -379,7 +384,7 @@ private void indexCommonNameExtension(Archive archive) throws Exception { String vernacularName = record.value(DwcTerm.vernacularName); String language = record.value(DcTerm.language); TopDocs result = getLoadIdxResults(null, "lsid", taxonID, 1); - if(result.totalHits > 0){ + if(result.totalHits.value > 0){ Document sciNameDoc = lsearcher.doc(result.scoreDocs[0].doc); //get the scientific name //we can add the common name @@ -388,7 +393,6 @@ private void indexCommonNameExtension(Archive archive) throws Exception { sciNameDoc.get(NameIndexField.NAME.toString()), taxonID, language, - 1.0f, false); this.vernacularIndexWriter.addDocument(doc); count++; @@ -492,15 +496,18 @@ public boolean createLoadingIndex(File archiveDirectory) throws Exception{ RankType rt = RankType.getForStrRank(taxonRank); if(rt != null){ doc.add(new StringField(NameIndexField.RANK.toString(), rt.getRank(), Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), rt.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), rt.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), rt.getId())); } else { doc.add(new StringField(NameIndexField.RANK.toString(), taxonRank, Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), 
RankType.UNRANKED.getId())); } } else { //put in unknown rank doc.add(new StringField(NameIndexField.RANK.toString(), "Unknown", Field.Store.YES)); - doc.add(new StringField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId().toString(), Field.Store.YES)); + doc.add(new IntPoint(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); + doc.add(new StoredField(NameIndexField.RANK_ID.toString(), RankType.UNRANKED.getId())); } if(StringUtils.equals(taxonID, acceptedNameUsageID) || StringUtils.equals(id, acceptedNameUsageID) || acceptedNameUsageID == null){ //mark this one as an accepted concept @@ -531,6 +538,9 @@ public boolean createLoadingIndex(File archiveDirectory) throws Exception{ nc = this.buildNameComplete(sn, sna, nc); otherNames.add(sn); otherNames.add(nc); + Matcher locality = LOCALITY_PATTERN.matcher(sn); + if (locality.matches()) + otherNames.add(locality.group(1).trim()); } } doc.add(new StoredField(NameIndexField.PRIORITY.toString(), score < 0 ? defaultScore : score)); @@ -582,11 +592,11 @@ public void generateIndex() throws Exception{ //get all the records that don't have parents that are accepted log.info("Loading index from temporary index."); TopDocs rootConcepts = getLoadIdxResults(null, "root", "T", PAGE_SIZE); - int left = 0; + int left = 1; int right = left; int lastRight = right; int count = 0; - while (rootConcepts != null && rootConcepts.totalHits > 0) { + while (rootConcepts != null && rootConcepts.totalHits.value > 0) { ScoreDoc lastConcept = null; for (ScoreDoc sd : rootConcepts.scoreDocs) { lastConcept = sd; @@ -626,7 +636,7 @@ private int addIndex(Document doc, int currentDepth, int currentLeft, LinnaeanRa String id = doc.get(NameIndexField.ID.toString()); //get children for this record TopDocs children = getLoadIdxResults(null, "parent_id", id, PAGE_SIZE); - if(children.totalHits == 0){ + if(children.totalHits.value == 0){ id = doc.get(NameIndexField.LSID.toString()); children = getLoadIdxResults(null, "parent_id", id, 
PAGE_SIZE); } @@ -729,8 +739,8 @@ private int addIndex(Document doc, int currentDepth, int currentLeft, LinnaeanRa doc.get(NameIndexField.AUTHOR.toString()), doc.get(NameIndexField.RANK.toString()), doc.get(NameIndexField.RANK_ID.toString()), - Integer.toString(left), - Integer.toString(right), + left, + right, newcl, nameComplete, otherNames, @@ -752,9 +762,9 @@ protected Document createALASynonymDocument(String scientificName, String author String genus = null; String specificEpithet = null; String infraspecificEpithet = null; - try { + try { TopDocs hits = this.cbSearcher.search(new TermQuery(new Term(NameIndexField.LSID.toString(), acceptedLsid)), 1); - if (hits.totalHits > 0) + if (hits.totalHits.value > 0) accepted = this.cbSearcher.doc(hits.scoreDocs[0].doc); } catch (Exception ex) { log.warn("Error finding accepted document for " + acceptedLsid, ex); @@ -786,7 +796,7 @@ protected Document createALASynonymDocument(String scientificName, String author } Document doc = createALAIndexDocument(scientificName, id, lsid, null, null, - kingdom, null, phylum, null, clazz, null, order, null, family, null, genus, null, null, null, null, null, + kingdom, null, phylum, null, clazz, null, order, null, family, null, genus, null, null, null, 0, 0, acceptedLsid, specificEpithet, infraspecificEpithet, author, nameComplete, otherNames, priority); if (doc != null && synonymType != null) { try { diff --git a/src/main/java/au/org/ala/names/util/DwcaWriter.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java similarity index 96% rename from src/main/java/au/org/ala/names/util/DwcaWriter.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java index 280014e2e..bdcf81c0b 100644 --- a/src/main/java/au/org/ala/names/util/DwcaWriter.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/DwcaWriter.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import com.google.common.collect.Maps; diff --git a/src/main/java/au/org/ala/names/util/GbifModule.java b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java similarity index 78% rename from src/main/java/au/org/ala/names/util/GbifModule.java rename to ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java index 1e90548d4..481c51a8d 100644 --- a/src/main/java/au/org/ala/names/util/GbifModule.java +++ b/ala-name-matching-builder/src/main/java/au/org/ala/names/util/GbifModule.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.util; import com.fasterxml.jackson.core.*; diff --git a/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt b/ala-name-matching-builder/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt similarity index 100% rename from src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt rename to ala-name-matching-builder/src/main/resources/au/org/ala/homonyms/cross_rank_homonyms.txt diff --git a/src/main/resources/au/org/ala/names/index/author_abbreviations.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/author_abbreviations.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/author_abbreviations.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/author_abbreviations.csv diff --git a/src/main/resources/au/org/ala/names/index/informal_names.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv similarity index 95% rename from src/main/resources/au/org/ala/names/index/informal_names.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv index e73d214ba..23973ee57 100644 --- a/src/main/resources/au/org/ala/names/index/informal_names.csv +++ b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/informal_names.csv @@ -1,3 +1,3 @@ label,pattern,title,link numbered group,^\\p{Alpha}+\\s+(?i:group|subgroup|species)\\s*\\d+\\.?,Informal groups of the form , -gen nov,(?i:gen\\.?\\s+nov\\.?)$,New genus name type gen. nov., +gen nov,(?i:gen\\.?\\s+nov\\.?)$,New genus name type gen. 
nov., \ No newline at end of file diff --git a/src/main/resources/au/org/ala/names/index/nomenclatural_codes.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/nomenclatural_codes.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/nomenclatural_codes.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/nomenclatural_codes.csv diff --git a/src/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/nomenclatural_status_codes.csv diff --git a/src/main/resources/au/org/ala/names/index/rank_codes.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/rank_codes.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/rank_codes.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/rank_codes.csv diff --git a/src/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv b/ala-name-matching-builder/src/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv similarity index 100% rename from src/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv rename to ala-name-matching-builder/src/main/resources/au/org/ala/names/index/taxonomic_type_codes.csv diff --git a/src/main/resources/blacklist.txt b/ala-name-matching-builder/src/main/resources/blacklist.txt similarity index 100% rename from src/main/resources/blacklist.txt rename to ala-name-matching-builder/src/main/resources/blacklist.txt diff --git a/src/main/resources/log4j.xml b/ala-name-matching-builder/src/main/resources/log4j.xml similarity index 100% rename from src/main/resources/log4j.xml rename to ala-name-matching-builder/src/main/resources/log4j.xml 
diff --git a/src/main/resources/taxonomy.properties b/ala-name-matching-builder/src/main/resources/taxonomy.properties similarity index 98% rename from src/main/resources/taxonomy.properties rename to ala-name-matching-builder/src/main/resources/taxonomy.properties index 00d2a60e2..216340814 100644 --- a/src/main/resources/taxonomy.properties +++ b/ala-name-matching-builder/src/main/resources/taxonomy.properties @@ -38,6 +38,7 @@ count.vernacularName.placed=Placed {0} additional vernacular names count.vernacularName.unplaced=Unable to find taxa for {0} additional vernacular names dwca.additionalInfo=Created by combining source taxonomies using the ala-name-matching algorithms. \ See https://github.com/AtlasOfLivingAustralia/ala-name-matching for more information. +instance.accepted.invalidLink=Invalid accepted link for {0} {1} - {2} instance.accepted.resolve=Unable to resolve accepted taxon for {3} instance.accepted.resolve.loop=Loop resolving accepted taxon for {3} - {2} instance.accepted.resolve.loop.provenance=Synonym loop resolved by converting to inferred unplaced @@ -47,6 +48,7 @@ instance.discarded.synonym.provenance=Discarded name synonymised into this taxon instance.inferredSynonym.provenance=Inferred from {0} in source {1} instance.multiIndex=Multiple index entries for {3}: {4} {5} choosing first instance.noIndex=No index entry for {3} +instance.parent.invalidLink=Invalid parent link for {0} {1} - {2} instance.parent.resolve=Unable to resolve parent for {3} instance.parent.resolve.loop=Loop resolving parent for {3} - {2} instance.parent.resolve.loop.provenance=Parent loop resolved by replacing parent with the unknown taxon @@ -74,6 +76,7 @@ name.UnrankedScientificName.principal=Principal for unranked scientific name {3} name.principal=Principal for {3} is {4} nomenclaturalCode.notFound=Cant find nomenclatural code {2} nomenclaturalStatus.notFound=Cant find nomenclatural status {2}, ignoring - reported once for each status 
+provider.archive.noMetadata=Archive has no metadata provider.validation.unknownTaxonID.noID=No unknown taxon identifier provider.validation.unknownTaxonID.notFound=Unknown taxon identifier {0} not found rank.notFound=Cant find rank of {2}, making unranked - reported once for each rank diff --git a/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java similarity index 93% rename from src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java index 04a527bcc..5f565c9fb 100644 --- a/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALANameAnalyserTest.java @@ -1,5 +1,21 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; @@ -253,6 +269,27 @@ public void testKey25() throws Exception { assertEquals(RankType.UNRANKED, key.getRank()); } + + // Test aff. name looks like an author + @Test + public void testKey26() throws Exception { + // With author + NameKey key1 = this.analyser.analyse(null, "Carex aff. tereticaulis (Lake Omeo)", "sensu G.W. 
Carr", RankType.UNRANKED, TaxonomicType.INFERRED_UNPLACED, true); + assertEquals(null, key1.getCode()); + assertEquals(NameType.DOUBTFUL, key1.getType()); + assertEquals("CAREX AFF TERETICAULIS LAKE OMEO", key1.getScientificName()); + assertEquals("sensu G.W. Carr", key1.getScientificNameAuthorship()); + assertEquals(RankType.UNRANKED, key1.getRank()); + + // Without author + NameKey key2 = this.analyser.analyse(null, "Carex aff. tereticaulis (Lake Omeo)", null, RankType.UNRANKED, TaxonomicType.INFERRED_UNPLACED, true); + assertEquals(null, key2.getCode()); + assertEquals(NameType.DOUBTFUL, key2.getType()); + assertEquals("CAREX AFF TERETICAULIS LAKE OMEO", key2.getScientificName()); + assertEquals(null, key2.getScientificNameAuthorship()); + assertEquals(RankType.UNRANKED, key2.getRank()); + } + @Test public void testAuthorEquals1() throws Exception { assertEquals(0, this.analyser.compareAuthor(null, null)); diff --git a/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java similarity index 96% rename from src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java index 18c3919b4..5868f2f18 100644 --- a/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ALATaxonResolverTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. 
See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.util.TestUtils; diff --git a/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java similarity index 89% rename from src/test/java/au/org/ala/names/index/CSVNameSourceTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java index 809649320..498344ff8 100644 --- a/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/CSVNameSourceTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/src/test/java/au/org/ala/names/index/NameProviderTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java similarity index 92% rename from src/test/java/au/org/ala/names/index/NameProviderTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java index 89fc351bd..cbf187c4a 100644 --- a/src/test/java/au/org/ala/names/index/NameProviderTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/NameProviderTest.java @@ -1,15 +1,28 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.TaxonomicType; import au.org.ala.names.util.TestUtils; -import com.fasterxml.jackson.databind.JavaType; import com.fasterxml.jackson.databind.ObjectMapper; import org.gbif.api.vocabulary.NomenclaturalCode; import org.junit.Before; import org.junit.Test; -import java.util.List; - import static org.junit.Assert.*; /** diff --git a/ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java new file mode 100644 index 000000000..6d628af90 --- /dev/null +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/RankComparatorTest.java @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + +package au.org.ala.names.index; + +import static au.org.ala.names.model.RankType.*; + +import static org.gbif.checklistbank.model.Equality.*; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class RankComparatorTest { + private RankComparator comparator; + + @Before + public void setUp() throws Exception { + this.comparator = new RankComparator(); + } + + @Test + public void testCompare1() { + assertEquals(EQUAL, this.comparator.compare(KINGDOM, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, SUBSPECIES)); + } + + + @Test + public void testCompare2() { + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, KINGDOM)); + assertEquals(EQUAL, this.comparator.compare(PHYLUM, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(PHYLUM, SUBSPECIES)); + } + + @Test + public void testCompare3() { + assertEquals(DIFFERENT, this.comparator.compare(CLASS, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, PHYLUM)); + assertEquals(EQUAL, this.comparator.compare(CLASS, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, FAMILY)); + assertEquals(DIFFERENT, 
this.comparator.compare(CLASS, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, SUBSPECIES)); + } + + @Test + public void testCompare4() { + assertEquals(DIFFERENT, this.comparator.compare(ORDER, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, CLASS)); + assertEquals(EQUAL, this.comparator.compare(ORDER, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(ORDER, SUBSPECIES)); + } + + @Test + public void testCompare5() { + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, ORDER)); + assertEquals(EQUAL, this.comparator.compare(FAMILY, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(FAMILY, SUBSPECIES)); + } + + @Test + public void testCompare6() { + assertEquals(DIFFERENT, this.comparator.compare(GENUS, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, FAMILY)); + assertEquals(EQUAL, this.comparator.compare(GENUS, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(GENUS, SUBSPECIES)); + } + + @Test + public void testCompare7() { + assertEquals(DIFFERENT, 
this.comparator.compare(SPECIES, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, GENUS)); + assertEquals(EQUAL, this.comparator.compare(SPECIES, SPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(SPECIES, SUBSPECIES)); + } + + @Test + public void testCompare8() { + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, KINGDOM)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, PHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, ORDER)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, FAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, GENUS)); + assertEquals(DIFFERENT, this.comparator.compare(SUBSPECIES, SPECIES)); + assertEquals(EQUAL, this.comparator.compare(SUBSPECIES, SUBSPECIES)); + } + + @Test + public void testCompare9() { + assertEquals(DIFFERENT, this.comparator.compare(KINGDOM, SUBPHYLUM)); + assertEquals(DIFFERENT, this.comparator.compare(CLASS, INFRAGENUS)); + assertEquals(DIFFERENT, this.comparator.compare(SUBORDER, CLASS)); + assertEquals(DIFFERENT, this.comparator.compare(INFRAORDER, INFRAFAMILY)); + assertEquals(DIFFERENT, this.comparator.compare(SUBGENUS, SUBSECTION_ZOOLOGY)); + assertEquals(DIFFERENT, this.comparator.compare(INFRAFAMILY, SECTION_ZOOLOGY)); + assertEquals(DIFFERENT, this.comparator.compare(INFRAFAMILY, SECTION_BOTANY)); + assertEquals(DIFFERENT, this.comparator.compare(PARVORDER, SPECIES)); + } + + + @Test + public void testCompare10() { + assertEquals(EQUAL, this.comparator.compare(INFRAKINGDOM, SUBPHYLUM)); + assertEquals(EQUAL, this.comparator.compare(SUPERCLASS, INFRACLASS)); + 
assertEquals(EQUAL, this.comparator.compare(SUBORDER, SUPERFAMILY)); + assertEquals(EQUAL, this.comparator.compare(ORDER, SUPERFAMILY)); + assertEquals(EQUAL, this.comparator.compare(INFRACLASS, ORDER)); + assertEquals(EQUAL, this.comparator.compare(SECTION_BOTANY, SERIES_BOTANY)); + assertEquals(EQUAL, this.comparator.compare(INFRAFAMILY, GENUS)); + assertEquals(EQUAL, this.comparator.compare(PARVORDER, SUBFAMILY)); + } + + @Test + public void testCompare11() { + assertEquals(EQUAL, this.comparator.compare(SUBSPECIES, CULTIVAR)); + assertEquals(EQUAL, this.comparator.compare(VARIETY, FORM)); + assertEquals(EQUAL, this.comparator.compare(SUBVARIETY, SUBSPECIES)); + assertEquals(EQUAL, this.comparator.compare(NOTHOSPECIES, SUBFORM)); + } + + @Test + public void testCompare12() { + assertEquals(EQUAL, this.comparator.compare(SUBSPECIES, INFORMAL)); + assertEquals(DIFFERENT, this.comparator.compare(INCERTAE_SEDIS, FORM)); + assertEquals(EQUAL, this.comparator.compare(UNRANKED, SUBSPECIES)); + assertEquals(DIFFERENT, this.comparator.compare(NOTHOSPECIES, SPECIES_INQUIRENDA)); + } + + + @Test + public void testCompare13() { + assertEquals(EQUAL, this.comparator.compare(INCERTAE_SEDIS, INFORMAL)); + assertEquals(EQUAL, this.comparator.compare(UNRANKED, SPECIES_INQUIRENDA)); + assertEquals(EQUAL, this.comparator.compare(UNRANKED, INCERTAE_SEDIS)); + assertEquals(EQUAL, this.comparator.compare(UNRANKED, SPECIES_INQUIRENDA)); + } + +} \ No newline at end of file diff --git a/src/test/java/au/org/ala/names/index/ScientificNameTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java similarity index 79% rename from src/test/java/au/org/ala/names/index/ScientificNameTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java index 8006d53f5..3df72347a 100644 --- a/src/test/java/au/org/ala/names/index/ScientificNameTest.java +++ 
b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/ScientificNameTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/src/test/java/au/org/ala/names/index/TaxonConceptTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java similarity index 79% rename from src/test/java/au/org/ala/names/index/TaxonConceptTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java index 97c6c435d..bfa40b4c6 100644 --- a/src/test/java/au/org/ala/names/index/TaxonConceptTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonConceptTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; diff --git a/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java similarity index 92% rename from src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java index 1a108e5a2..f4fc12968 100644 --- a/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyConfiugrationTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.util.TestUtils; @@ -5,7 +21,6 @@ import org.gbif.api.vocabulary.NomenclaturalCode; import org.gbif.checklistbank.authorship.AuthorComparator; import org.gbif.checklistbank.model.Equality; -import org.junit.Before; import org.junit.Test; import java.io.StringWriter; diff --git a/src/test/java/au/org/ala/names/index/TaxonomyTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java similarity index 95% rename from src/test/java/au/org/ala/names/index/TaxonomyTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java index 7c88c6946..fb6af0c88 100644 --- a/src/test/java/au/org/ala/names/index/TaxonomyTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/TaxonomyTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index; import au.org.ala.names.model.RankType; @@ -404,6 +420,7 @@ public void testResolveUnranked6() throws Exception { assertEquals(RankType.UNRANKED, syn1.getRank()); assertNull(syn1.getProvenance()); } + @Test public void testResolveUnranked7() throws Exception { TaxonomyConfiguration config = TaxonomyConfiguration.read(this.resourceReader("taxonomy-config-2.json")); @@ -431,6 +448,40 @@ public void testResolveUnranked7() throws Exception { assertFalse(syn1.getProvenance() == null || syn1.getProvenance().isEmpty()); } + // Issue #126: do not merge "X y" with "X aff. y". Concept-2-1 and Concept-3-1 carry the same name with different authorship, so they are expected to be separate taxon concepts sharing one scientific name. + @Test + public void testResolveUnranked8() throws Exception { + TaxonomyConfiguration config = TaxonomyConfiguration.read(this.resourceReader("taxonomy-config-2.json")); + this.taxonomy = new Taxonomy(config, null); + this.taxonomy.begin(); + CSVNameSource source1 = new CSVNameSource(this.resourceReader("taxonomy-32.csv"), DwcTerm.Taxon); + this.taxonomy.load(Arrays.asList(source1)); + this.taxonomy.resolve(); + TaxonConceptInstance acc1 = this.taxonomy.getInstance("Concept-1-1"); + TaxonConceptInstance acc2 = this.taxonomy.getInstance("Concept-2-1"); + TaxonConceptInstance acc3 = this.taxonomy.getInstance("Concept-3-1"); + assertNotNull(acc1); + assertNotNull(acc2); + assertNotNull(acc3); + TaxonConcept tcAcc1 = acc1.getContainer(); + TaxonConcept tcAcc2 = acc2.getContainer(); + TaxonConcept tcAcc3 = acc3.getContainer(); + assertNotSame(tcAcc1, tcAcc2); + assertNotSame(tcAcc1, tcAcc3); + assertNotSame(tcAcc2, tcAcc3); + ScientificName sn1 = tcAcc1.getContainer(); + ScientificName sn2 = tcAcc2.getContainer(); + ScientificName sn3 = tcAcc3.getContainer(); + assertNotSame(sn1, sn2); + assertSame(sn2, sn3); + UnrankedScientificName usn1 = sn1.getContainer(); + UnrankedScientificName usn2 = sn2.getContainer(); + assertNotSame(usn1, usn2); + BareName bn1 = usn1.getContainer(); + BareName bn2 = usn2.getContainer(); + assertNotSame(bn1, bn2); + } + // Test placement on an
uncoded name @Test public void testPlaceUncoded1() throws Exception { diff --git a/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java similarity index 91% rename from src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java index 3f6e3be73..f4cbb35cc 100644 --- a/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/AndTaxonConditionTest.java @@ -1,10 +1,25 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.ALANameAnalyser; import au.org.ala.names.index.NameKey; import au.org.ala.names.index.NameProvider; import au.org.ala.names.index.TaxonConceptInstance; -import au.org.ala.names.model.RankType; import au.org.ala.names.model.TaxonomicType; import au.org.ala.names.util.TestUtils; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java similarity index 91% rename from src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java index 9759d26e4..1c01c3599 100644 --- a/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/KeyAdjusterTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.*; diff --git a/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java similarity index 93% rename from src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java index c256f22ad..c9e44d0de 100644 --- a/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/MatchTaxonConditionTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.*; @@ -440,6 +456,23 @@ public void testMatch41() { assertFalse(condition.match(instance, key)); } + + @Test + public void testMatch42() { + MatchTaxonCondition condition = new MatchTaxonCondition(); + condition.setScientificName("Unknown(\\s.*|)"); + condition.setMatchType(NameMatchType.REGEX); + TaxonConceptInstance instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "unknown", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, null, null, null, null, null, null, null); + NameKey key = this.analyser.analyse(instance); + assertTrue(condition.match(instance, key)); + instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "Unknown sp.", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, null, null, null, null, null, null, null); + key = this.analyser.analyse(instance); + assertTrue(condition.match(instance, key)); + instance = new TaxonConceptInstance("ID-1", NomenclaturalCode.BOTANICAL, NomenclaturalCode.BOTANICAL.getAcronym(), this.provider, "Unknownsp.", null, null, "1975", TaxonomicType.ACCEPTED, TaxonomicType.ACCEPTED.getTerm(), RankType.SPECIES, RankType.SPECIES.getRank(), null, null, null, null, null, null, null, null, null, null); + key = this.analyser.analyse(instance); + assertFalse(condition.match(instance, key)); + } + @Test public void testWrite1() throws Exception { MatchTaxonCondition condition = new MatchTaxonCondition(); diff --git a/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java similarity index 92% rename from 
src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java index c9a564a3f..9389f5267 100644 --- a/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/OrTaxonConditionTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.ALANameAnalyser; diff --git a/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java similarity index 88% rename from src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java index dfc5d0885..eea18097a 100644 --- a/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/index/provider/ScoreAdjusterTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.index.provider; import au.org.ala.names.index.ALANameAnalyser; diff --git a/src/test/java/au/org/ala/names/util/TestUtils.java b/ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java similarity index 86% rename from src/test/java/au/org/ala/names/util/TestUtils.java rename to ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java index 2117faef3..6800124f3 100644 --- a/src/test/java/au/org/ala/names/util/TestUtils.java +++ b/ala-name-matching-builder/src/test/java/au/org/ala/names/util/TestUtils.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.util; import au.org.ala.names.index.NameProvider; diff --git a/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml similarity index 81% rename from src/test/resources/au/org/ala/names/index/dwca-1/meta.xml rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml index 05e5c1836..55dff6b37 100644 --- a/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml +++ b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/meta.xml @@ -1,3 +1,19 @@ + + diff --git a/src/test/resources/au/org/ala/names/index/dwca-1/taxon.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/taxon.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/dwca-1/taxon.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/taxon.csv diff --git a/src/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/dwca-1/vernacularNames.csv diff --git a/src/test/resources/au/org/ala/names/index/name-provider-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/name-provider-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/name-provider-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/name-provider-1.json diff --git a/src/test/resources/au/org/ala/names/index/provider/and-condition-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/and-condition-1.json similarity index 100% rename from 
src/test/resources/au/org/ala/names/index/provider/and-condition-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/and-condition-1.json diff --git a/src/test/resources/au/org/ala/names/index/provider/match-condition-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/match-condition-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/provider/match-condition-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/match-condition-1.json diff --git a/src/test/resources/au/org/ala/names/index/provider/or-condition-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/or-condition-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/provider/or-condition-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/provider/or-condition-1.json diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-1.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-1.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-1.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-1.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-10.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-10.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-10.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-10.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-11.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-11.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-11.csv rename to 
ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-11.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-12.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-12.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-12.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-12.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-13.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-13.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-13.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-13.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-14.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-14.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-14.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-14.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-15.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-15.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-15.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-15.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-16.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-16.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-16.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-16.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-17.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-17.csv similarity index 100% rename from 
src/test/resources/au/org/ala/names/index/taxonomy-17.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-17.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-18.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-18.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-18.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-18.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-19.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-19.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-19.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-19.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-2.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-2.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-2.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-2.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-20.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-20.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-20.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-20.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-21.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-21.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-21.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-21.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-22.csv 
b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-22.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-22.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-22.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-23.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-23.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-23.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-23.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-24.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-24.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-24.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-24.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-25.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-25.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-25.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-25.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-26.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-26.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-26.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-26.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-27.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-27.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-27.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-27.csv diff --git 
a/src/test/resources/au/org/ala/names/index/taxonomy-28.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-28.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-28.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-28.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-29.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-29.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-29.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-29.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-3.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-3.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-3.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-3.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-30.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-30.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-30.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-30.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-31.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-31.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-31.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-31.csv diff --git a/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-32.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-32.csv new file mode 100644 index 000000000..adb1b624f --- /dev/null +++ 
b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-32.csv @@ -0,0 +1,4 @@ +taxonID,parentNameUsageID,acceptedNameUsageID,datasetID,nomenclaturalCode,scientificName,scientificNameAuthorship,taxonRank,taxonConceptID,scientificNameID,taxonomicStatus,nomenclaturalStatus,establishmentMeans,nameAccordingToID,nameAccordingTo,namePublishedInID,namePublishedIn,namePubishedInYear,nameComplete,nameFormatted,source +"Concept-1-1","","","dr100","ICN","Carex tereticaulis","F.Muell.","species","","","accepted","","","","","","","","","","" +"Concept-2-1","","","dr108","","Carex aff. tereticaulis (Lake Omeo)","","unranked","","","inferredUnplaced","","","","","","","","","","" +"Concept-3-1","","","dr108","","Carex aff. tereticaulis (Lake Omeo)","sensu G.W. Carr","unranked","","","inferredUnplaced","","","","","","","","","","" diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-4.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-4.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-4.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-4.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-5.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-5.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-5.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-5.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-6.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-6.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-6.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-6.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-7.csv 
b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-7.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-7.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-7.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-8.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-8.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-8.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-8.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-9.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-9.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-9.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-9.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-bad-1.csv diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-config-1.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-1.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-config-1.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-1.json diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-config-2.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-2.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-config-2.json rename to 
ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-2.json diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-config-3.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-3.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-config-3.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-3.json diff --git a/src/test/resources/au/org/ala/names/index/taxonomy-config-4.json b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-4.json similarity index 100% rename from src/test/resources/au/org/ala/names/index/taxonomy-config-4.json rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/taxonomy-config-4.json diff --git a/src/test/resources/au/org/ala/names/index/vernacular-1.csv b/ala-name-matching-builder/src/test/resources/au/org/ala/names/index/vernacular-1.csv similarity index 100% rename from src/test/resources/au/org/ala/names/index/vernacular-1.csv rename to ala-name-matching-builder/src/test/resources/au/org/ala/names/index/vernacular-1.csv diff --git a/ala-name-matching-distribution/pom.xml b/ala-name-matching-distribution/pom.xml new file mode 100644 index 000000000..29a1d09f4 --- /dev/null +++ b/ala-name-matching-distribution/pom.xml @@ -0,0 +1,68 @@ + + + + ala-name-matching + au.org.ala + 4.0 + + 4.0.0 + + ala-name-matching-distribution + pom + + ALA Name Matching Distribution + Distribution along with dependencies + + + + ${project.parent.groupId} + ala-name-matching-model + ${project.version} + + + ${project.parent.groupId} + ala-name-matching-search + ${project.version} + + + ${project.parent.groupId} + ala-name-matching-builder + ${project.version} + + + ${project.parent.groupId} + ala-name-matching-tools + ${project.version} + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + src/assembly/assembly.xml + + + + true + 
lib/ + + + + + + package + + single + + + + + + + \ No newline at end of file diff --git a/ala-name-matching-distribution/src/assembly/assembly.xml b/ala-name-matching-distribution/src/assembly/assembly.xml new file mode 100644 index 000000000..6c2fe4958 --- /dev/null +++ b/ala-name-matching-distribution/src/assembly/assembly.xml @@ -0,0 +1,35 @@ + + distribution + + zip + + false + + + true + + au.org.ala:ala-name-matching-model + au.org.ala:ala-name-matching-search + au.org.ala:ala-name-matching-builder + au.org.ala:ala-name-matching-tools + + + lib + false + + + + + + ${project.build.scriptSourceDirectory} + + + merge.sh + index.sh + compare.sh + generate.sh + dump.sh + + + + \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/compare.sh b/ala-name-matching-distribution/src/main/scripts/compare.sh new file mode 100644 index 000000000..952887cdb --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/compare.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. 
+# +# + +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS}" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.NameListComparer $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/dump.sh b/ala-name-matching-distribution/src/main/scripts/dump.sh new file mode 100644 index 000000000..03b3fa27e --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/dump.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# + +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS}" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.TermDump $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/generate.sh b/ala-name-matching-distribution/src/main/scripts/generate.sh new file mode 100644 index 000000000..2c8cc304b --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/generate.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. 
See the License for the specific language governing +# rights and limitations under the License. +# +# + +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS}" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.util.NameListGenerator $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/index.sh b/ala-name-matching-distribution/src/main/scripts/index.sh new file mode 100644 index 000000000..97f10ed39 --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/index.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# + +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS}" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.search.DwcaNameIndexer $* \ No newline at end of file diff --git a/ala-name-matching-distribution/src/main/scripts/merge.sh b/ala-name-matching-distribution/src/main/scripts/merge.sh new file mode 100644 index 000000000..07fa3e583 --- /dev/null +++ b/ala-name-matching-distribution/src/main/scripts/merge.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# Copyright (c) 2021 Atlas of Living Australia +# All Rights Reserved. +# +# The contents of this file are subject to the Mozilla Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. 
You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# + +SCRIPT_HOME=`dirname $0` +JAVA_OPTIONS="${JAVA_OPTIONS} -Xmx6G" +exec java ${JAVA_OPTIONS} -cp "$SCRIPT_HOME/lib/*" au.org.ala.names.index.TaxonomyBuilder $* \ No newline at end of file diff --git a/ala-name-matching-model/pom.xml b/ala-name-matching-model/pom.xml new file mode 100644 index 000000000..8ad2889a0 --- /dev/null +++ b/ala-name-matching-model/pom.xml @@ -0,0 +1,44 @@ + + + + ala-name-matching + au.org.ala + 4.0 + + 4.0.0 + + ala-name-matching-model + ALA Name Matching Model + Core name matching data model and vocabularies + + + + org.apache.commons + commons-lang3 + 3.12.0 + + + org.gbif + gbif-common + 0.37 + + + org.gbif + name-parser + 2.24 + + + uk.ac.shef.wit + simmetrics + ${simmetrics.version} + + + com.opencsv + opencsv + ${opencsv.version} + test + + + \ No newline at end of file diff --git a/src/main/java/au/org/ala/names/model/ALAParsedName.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/ALAParsedName.java similarity index 100% rename from src/main/java/au/org/ala/names/model/ALAParsedName.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/ALAParsedName.java diff --git a/src/main/java/au/org/ala/names/model/ErrorType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/ErrorType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/ErrorType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/ErrorType.java diff --git a/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java similarity index 86% rename from 
src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java index 244b4bf49..338f496d3 100644 --- a/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/LinnaeanRankClassification.java @@ -1,10 +1,25 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.model; -import org.apache.commons.lang.builder.EqualsBuilder; -import org.apache.commons.lang.builder.HashCodeBuilder; -import org.apache.commons.lang.builder.ToStringBuilder; -import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.builder.EqualsBuilder; +import org.apache.commons.lang3.builder.HashCodeBuilder; +import org.apache.commons.lang3.builder.ToStringBuilder; /** * A model object that represents a Linnaean Classification. 
@@ -358,7 +373,7 @@ public void setRank(String rank) { } /** - * @see java.lang.Object#toString() + * @see Object#toString() */ public String toString() { return new ToStringBuilder(this) @@ -386,7 +401,7 @@ public String toCSV(char sep) { } /** - * @see java.lang.Object#equals(Object) + * @see Object#equals(Object) */ public boolean equals(Object object) { if (!(object instanceof LinnaeanRankClassification)) { @@ -459,36 +474,6 @@ public boolean hasIdenticalClassification(LinnaeanRankClassification lrc, RankTy return true; } - /** - * Returns the additional string that needs to be included in a search - * - * @param optional Indicates whether the the terms should be optional - * @return - */ - public String getLuceneSearchString(boolean optional) { - String prefix = optional ? " " : " +"; - StringBuilder sb = new StringBuilder(); - if (StringUtils.isNotEmpty(kingdom)) - sb.append(prefix).append(RankType.KINGDOM.getRank()).append(":\"").append(kingdom).append("\""); - if (StringUtils.isNotEmpty(phylum)) - sb.append(prefix).append(RankType.PHYLUM.getRank()).append(":\"").append(phylum).append("\""); - if (StringUtils.isNotEmpty(klass)) - sb.append(prefix).append(RankType.CLASS.getRank()).append(":\"").append(klass).append("\""); - if (StringUtils.isNotEmpty(order)) - sb.append(prefix).append(RankType.ORDER.getRank()).append(":\"").append(order).append("\""); - if (StringUtils.isNotEmpty(family)) - sb.append(prefix).append(RankType.FAMILY.getRank()).append(":\"").append(family).append("\""); - if (StringUtils.isNotEmpty(genus)) - sb.append(prefix).append(RankType.GENUS.getRank()).append(":\"").append(genus).append("\""); - if (StringUtils.isNotEmpty(species)) - sb.append(prefix).append(RankType.SPECIES.getRank()).append(":\"").append(species).append("\""); - //authorship is always optional due to inconsistencies in the name format etc... 
- if (StringUtils.isNotEmpty(authorship)) - sb.append(" ").append(NameIndexField.AUTHOR.toString()).append(":\"").append(authorship).append("\"~"); - return sb.toString(); - } - - public static void main(String[] args) { LinnaeanRankClassification a = new LinnaeanRankClassification(null, null, null, null, null, null, "AuS bus"); LinnaeanRankClassification b = new LinnaeanRankClassification(null, null, null, null, null, null, new String("Aus bus")); @@ -497,7 +482,7 @@ public static void main(String[] args) { } /** - * @see java.lang.Object#hashCode() + * @see Object#hashCode() */ public int hashCode() { return new HashCodeBuilder(1497136033, 448920019).append(this.scientificName).append( diff --git a/src/main/java/au/org/ala/names/model/MatchMetrics.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java similarity index 91% rename from src/main/java/au/org/ala/names/model/MatchMetrics.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java index a3a8efd1d..eccb0260d 100644 --- a/src/main/java/au/org/ala/names/model/MatchMetrics.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchMetrics.java @@ -1,6 +1,21 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.model; -import au.org.ala.names.index.TaxonConceptInstance; import org.apache.commons.lang3.StringUtils; import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; import uk.ac.shef.wit.simmetrics.similaritymetrics.SmithWatermanGotoh; @@ -47,7 +62,7 @@ public MatchMetrics() { * * @return The priority * - * @see TaxonConceptInstance#getScore() + * @see au.org.ala.names.index.TaxonConceptInstance#getScore() */ public int getPriority() { return priority; diff --git a/src/main/java/au/org/ala/names/model/MatchType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/MatchType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/MatchType.java diff --git a/src/main/java/au/org/ala/names/model/MetricsResultDTO.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/MetricsResultDTO.java similarity index 100% rename from src/main/java/au/org/ala/names/model/MetricsResultDTO.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/MetricsResultDTO.java diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java new file mode 100644 index 000000000..41bbe6c55 --- /dev/null +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameFlag.java @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. 
See the License for the specific language governing + * rights and limitations under the License. + * + */ + +package au.org.ala.names.model; + +/** + * Flags indicating special-case information about a name + * + * @author Doug Palmer <Doug.Palmer@csiro.au> + * @copyright Copyright © 2019 Atlas of Living Australia + */ +public enum NameFlag { + /** The name is an autonym, meaning that it has been created without an author because a sub-taxon was created */ + AUTONYM +} diff --git a/src/main/java/au/org/ala/names/model/NameSearchResult.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameSearchResult.java similarity index 71% rename from src/main/java/au/org/ala/names/model/NameSearchResult.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/NameSearchResult.java index fc2abd7eb..cdf1f083b 100644 --- a/src/main/java/au/org/ala/names/model/NameSearchResult.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/NameSearchResult.java @@ -15,9 +15,7 @@ package au.org.ala.names.model; -import org.apache.commons.lang.StringUtils; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexableField; +import org.apache.commons.lang3.StringUtils; import java.util.LinkedHashMap; import java.util.Map; @@ -54,48 +52,30 @@ public NameSearchResult(String id, String lsid, MatchType type) { this.matchMetrics = new MatchMetrics(); } - public NameSearchResult(Document doc, MatchType type) { - this(doc.get(NameIndexField.ID.toString()), doc.get(NameIndexField.LSID.toString()), type); - kingdom = doc.get(RankType.KINGDOM.getRank()); - //System.out.println("Rank to use : " +doc.get(IndexField.RANK.toString())); - try { - rank = RankType.getForId(Integer.parseInt(doc.get(NameIndexField.RANK_ID.toString()))); - } catch (Exception e) { - } - String name = doc.get(NameIndexField.NAME_CANONICAL.toString()); - if (name == null) - name = doc.get(NameIndexField.NAME.toString()); - if (name == null) - name =
doc.get(NameIndexField.NAME_COMPLETE.toString()); - rankClass = new LinnaeanRankClassification(doc.get(RankType.KINGDOM.getRank()), - doc.get(RankType.PHYLUM.getRank()), - doc.get(RankType.CLASS.getRank()), - doc.get(RankType.ORDER.getRank()), - doc.get(RankType.FAMILY.getRank()), - doc.get(RankType.GENUS.getRank()), - name); - rankClass.setSpecies(doc.get(RankType.SPECIES.getRank())); - //add the ids - rankClass.setKid(doc.get("kid")); - rankClass.setPid(doc.get("pid")); - rankClass.setCid(doc.get("cid")); - rankClass.setOid(doc.get("oid")); - rankClass.setFid(doc.get("fid")); - rankClass.setGid(doc.get("gid")); - rankClass.setSid(doc.get("sid")); - rankClass.setAuthorship(doc.get(NameIndexField.AUTHOR.toString())); - //left and right values for the taxon concept - left = doc.get("left"); - right = doc.get("right"); - synonymType = SynonymType.getTypeFor(doc.get(NameIndexField.SYNONYM_TYPE.toString())); - String syn = doc.get(NameIndexField.ACCEPTED.toString()); - if (syn != null) { - acceptedLsid = syn; - } - IndexableField priority = doc.getField(NameIndexField.PRIORITY.toString()); + /** + * Construct a fully filled out result + * + * @param id The result identifier + * @param lsid The lsid of the concept + * @param acceptedLsid The lsid of the accepted concept + * @param left The left-value + * @param right The right-value + * @param rankClass The linnaean classification + * @param rank The rank + * @param type The match type + * @param synonymType The synonym type + * @param priority An optional match priority + */ + public NameSearchResult(String id, String lsid, String acceptedLsid, String left, String right, LinnaeanRankClassification rankClass, RankType rank, MatchType type, SynonymType synonymType, Integer priority) { + this(id, lsid, type); + this.acceptedLsid = acceptedLsid; + this.left = left; + this.right = right; + this.rankClass = rankClass; + this.rank = rank; + this.synonymType = synonymType; if (priority != null) - 
this.matchMetrics.setPriority(priority.numericValue().intValue()); - + this.matchMetrics.setPriority(priority); } public SynonymType getSynonymType() { diff --git a/src/main/java/au/org/ala/names/model/RankType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/RankType.java similarity index 96% rename from src/main/java/au/org/ala/names/model/RankType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/RankType.java index 773c1af75..bd68fb125 100644 --- a/src/main/java/au/org/ala/names/model/RankType.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/RankType.java @@ -31,9 +31,11 @@ public enum RankType { DOMAIN(800, "kingdom", Rank.DOMAIN, null, 800, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Domain", "Domain", "Superkingdom", "Empire"), KINGDOM(1000, "kingdom", Rank.KINGDOM, 2f, 1000, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Kingdom", "Kingdom"), SUBKINGDOM(1200, "subkingdom", Rank.SUBKINGDOM, null, 1200, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subkingdom"), + INFRAKINGDOM(1400, "infrakingdom", Rank.INFRAKINGDOM, null, 1400, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Infrakingdom"), SUPERPHYLUM(1800, "superphylum", Rank.SUPERPHYLUM, null, 2800, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Superphylum"), PHYLUM(2000, "phylum", Rank.PHYLUM, 2f, 2000, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Phylum", "Phylum", "division botany", "Division Botany"), SUBPHYLUM(2200, "subphylum", Rank.SUBPHYLUM, null, 2200, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subphylum", "subdivision botany"), + INFRAPHYLUM(2400, "infraphylum", Rank.INFRAPHYLUM, null, 2400, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Infraphylum", "infradivision botany"), SUPERCLASS(2800, "superclass", Rank.SUPERCLASS, null, 2800, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Superclass"), CLASS(3000, "class", Rank.CLASS, 2f, 3000, false, 
"http://rs.tdwg.org/ontology/voc/TaxonRank#Class", "Class"), SUBCLASS(3200, "subclass", Rank.SUBCLASS, null, 3200, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subclass"), @@ -66,7 +68,8 @@ public enum RankType { SUPERGENUS(5900, "genus", null, null, 5900, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Supergenus", "Supergenus"), GENUS_GROUP(5950, "genus group", null, null, 5950, true, "aggregate genera", "Aggregate Genera", "Genus Group"), GENUS(6000, "genus", Rank.GENUS, 3f, 6000, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Genus", "Genus"), - SUBGENUS(6500, "subgenus", Rank.SUBGENUS, null, 6500, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subgenus"), + SUBGENUS(6400, "subgenus", Rank.SUBGENUS, null, 6400, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subgenus"), + INFRAGENUS(6500, "infragenus", Rank.INFRAGENUS, null, 6500, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Infragenus"), SUPERSECTION_BOTANY(6550, "supersection botany", Rank.SECTION, null, 6550, false), SECTION_BOTANY(6600, "section botany", Rank.SECTION, null, 6600, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Section"), SUBSECTION_BOTANY(6700, "subsection botany", Rank.SUBSECTION, null, 6700, false, "http://rs.tdwg.org/ontology/voc/TaxonRank#Subsection"), diff --git a/src/main/java/au/org/ala/names/model/SynonymType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/SynonymType.java similarity index 100% rename from src/main/java/au/org/ala/names/model/SynonymType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/SynonymType.java diff --git a/src/main/java/au/org/ala/names/model/TaxonomicType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java similarity index 93% rename from src/main/java/au/org/ala/names/model/TaxonomicType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java index dfad2bba6..549414555 100644 --- 
a/src/main/java/au/org/ala/names/model/TaxonomicType.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicType.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.model; /** diff --git a/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java new file mode 100644 index 000000000..8e15d144d --- /dev/null +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + +package au.org.ala.names.model; + +/** + * Groupings of taxonomic types + * + * @author Doug Palmer <Doug.Palmer@csiro.au> + * @copyright Copyright © 2017 Atlas of Living Australia + */ +public enum TaxonomicTypeGroup { + ACCEPTED, + SYNONYM, + MISAPPLIED, + EXCLUDED, + MISCELLANEOUS, + INCERTAE_SEDIS, + SPECIES_INQUIRENDA, + UNPLACED, + DOUBTFUL, + INVALID +} diff --git a/src/main/java/au/org/ala/names/model/VernacularType.java b/ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java similarity index 81% rename from src/main/java/au/org/ala/names/model/VernacularType.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java index 92c197073..09cd73ee0 100644 --- a/src/main/java/au/org/ala/names/model/VernacularType.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/model/VernacularType.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.model; import java.util.HashMap; diff --git a/src/main/java/au/org/ala/names/search/ExcludedNameException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/ExcludedNameException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/ExcludedNameException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/ExcludedNameException.java diff --git a/src/main/java/au/org/ala/names/search/HomonymException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/HomonymException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/HomonymException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/HomonymException.java index 18e634062..0eee0734f 100644 --- a/src/main/java/au/org/ala/names/search/HomonymException.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/search/HomonymException.java @@ -15,10 +15,10 @@ package au.org.ala.names.search; -import java.util.List; - import au.org.ala.names.model.NameSearchResult; +import java.util.List; + /** * Exception that is thrown when the result is an unresolved * homonym diff --git a/src/main/java/au/org/ala/names/search/MisappliedException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/MisappliedException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/MisappliedException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/MisappliedException.java diff --git a/src/main/java/au/org/ala/names/search/ParentSynonymChildException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/ParentSynonymChildException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/ParentSynonymChildException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/ParentSynonymChildException.java diff --git 
a/src/main/java/au/org/ala/names/search/SPPException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/SPPException.java similarity index 100% rename from src/main/java/au/org/ala/names/search/SPPException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/SPPException.java diff --git a/src/main/java/au/org/ala/names/search/SearchResultException.java b/ala-name-matching-model/src/main/java/au/org/ala/names/search/SearchResultException.java similarity index 99% rename from src/main/java/au/org/ala/names/search/SearchResultException.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/search/SearchResultException.java index a277c7046..82044dfdb 100644 --- a/src/main/java/au/org/ala/names/search/SearchResultException.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/search/SearchResultException.java @@ -16,11 +16,10 @@ package au.org.ala.names.search; import au.org.ala.names.model.ErrorType; +import au.org.ala.names.model.NameSearchResult; import java.util.List; -import au.org.ala.names.model.NameSearchResult; - /** * The generic search result exception that can be thrown during a search. This exception * will be used to wrap any exception that occurs that do not fall into the other categories. diff --git a/src/main/java/au/org/ala/names/util/CleanedScientificName.java b/ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java similarity index 96% rename from src/main/java/au/org/ala/names/util/CleanedScientificName.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java index afe92e6a1..5246e63ba 100644 --- a/src/main/java/au/org/ala/names/util/CleanedScientificName.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/util/CleanedScientificName.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import java.text.CharacterIterator; diff --git a/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java b/ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java similarity index 94% rename from src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java rename to ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java index 96009a3cb..8708e97cb 100644 --- a/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.util; import java.util.Collections; @@ -5,7 +21,7 @@ import java.util.List; import java.util.StringTokenizer; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; /** * A Java implementation of the sound ex algorithm supplied by Tony Rees diff --git a/src/main/java/au/org/ala/vocab/ALATerm.java b/ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java similarity index 83% rename from src/main/java/au/org/ala/vocab/ALATerm.java rename to ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java index cfaf7dc0b..cc3f1cd20 100644 --- a/src/main/java/au/org/ala/vocab/ALATerm.java +++ b/ala-name-matching-model/src/main/java/au/org/ala/vocab/ALATerm.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.vocab; import org.gbif.dwc.terms.Term; diff --git a/src/main/java/org/gbif/nameparser/PhraseNameParser.java b/ala-name-matching-model/src/main/java/org/gbif/nameparser/PhraseNameParser.java similarity index 95% rename from src/main/java/org/gbif/nameparser/PhraseNameParser.java rename to ala-name-matching-model/src/main/java/org/gbif/nameparser/PhraseNameParser.java index 59b181ffc..f25a66e8e 100644 --- a/src/main/java/org/gbif/nameparser/PhraseNameParser.java +++ b/ala-name-matching-model/src/main/java/org/gbif/nameparser/PhraseNameParser.java @@ -16,8 +16,7 @@ package org.gbif.nameparser; import au.org.ala.names.model.ALAParsedName; -import au.org.ala.names.model.RankType; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.text.WordUtils; import org.gbif.api.exception.UnparsableException; import org.gbif.api.model.checklistbank.ParsedName; @@ -25,8 +24,6 @@ import org.gbif.api.vocabulary.Rank; import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -56,6 +53,7 @@ public class PhraseNameParser extends GBIFNameParser { static { HashMap ranks = new HashMap(); + ranks.put("f", Rank.FORM); ranks.put("subsp", Rank.SUBSPECIES); ranks.put("ssp", Rank.SUBSPECIES); ranks.put("var", Rank.VARIETY); @@ -68,7 +66,8 @@ public class PhraseNameParser extends GBIFNameParser { public static final String ALL_LETTERS_NUMBERS = NormalisedNameParser.NAME_LETTERS + NormalisedNameParser.name_letters + "0-9"; protected static final String LOCATION_OR_DESCR = "(?:[" + ALL_LETTERS_NUMBERS + " -'\"_\\.]+|\\.)"; protected static final String VOUCHER = "(\\([" + ALL_LETTERS_NUMBERS + "- \\./&,']+\\))"; - protected static final String SOURCE_AUTHORITY = "([" + ALL_LETTERS_NUMBERS + "\\[\\]'\" -,\\.]+|\\.)"; + protected static final String COMMENTARY = "(\\[[^\\]]*\\])"; + protected static final String 
SOURCE_AUTHORITY = "([" + ALL_LETTERS_NUMBERS + "'\" -,\\.]+|\\.)"; protected static final String PHRASE_RANKS = "(?:" + StringUtils.join(VALID_PHRASE_RANKS.keySet(), "|") + ")\\.? "; private static final String RANK_MARKER_ALL = "(notho)? *(" + StringUtils.join(RankUtils.RANK_MARKER_MAP.keySet(), "|") + ")\\.?"; @@ -85,8 +84,10 @@ public class PhraseNameParser extends GBIFNameParser { // Group 3 indicates the mandatory location/desc for the phrase name. But it may be possible to have homonyms if the VOUCHER is not supplied + "(" + LOCATION_OR_DESCR + ")" //Group 4 is the VOUCHER for the phrase it indicates the collector and a voucher id - + VOUCHER + "?" - //Group 5 is the party propsoing addition of the taxon + + VOUCHER + "?(?: *)" + // Group 5 is any commentary + + COMMENTARY + "?(?: *)" + //Group 6 is the party propsoing addition of the taxon + SOURCE_AUTHORITY + "?$" ); @@ -146,7 +147,7 @@ public ParsedName parse(String scientificName, Rank rank) throws UnparsableExcep alapn.setAuthorsParsed(false); alapn.setLocationPhraseDescription(StringUtils.trimToNull(m.group(3))); alapn.setPhraseVoucher(StringUtils.trimToNull(m.group(4))); - alapn.setPhraseNominatingParty(StringUtils.trimToNull(m.group(5))); + alapn.setPhraseNominatingParty(StringUtils.trimToNull(m.group(6))); return alapn; } diff --git a/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java b/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java similarity index 91% rename from src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java rename to ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java index 79562db5d..8ba03d9b3 100644 --- a/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java +++ b/ala-name-matching-model/src/test/java/au/org/ala/names/parser/util/PhraseNameParserTests.java @@ -1,6 +1,26 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.parser.util; import au.org.ala.names.model.ALAParsedName; +import com.opencsv.CSVParser; +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; import org.gbif.api.exception.UnparsableException; import org.gbif.api.model.checklistbank.ParsedName; import org.gbif.api.vocabulary.NameType; @@ -314,7 +334,6 @@ public void testSpeciesLevelPhraseName5() throws Exception { assertEquals("(BR Maslin 7761)", ((ALAParsedName) pn).getPhraseVoucher()); assertEquals("Goodlands", ((ALAParsedName) pn).cleanPhrase); assertEquals("Maslin7761", ((ALAParsedName) pn).cleanVoucher); - assertEquals("[aff. resinosa]", ((ALAParsedName) pn).getPhraseNominatingParty()); assertEquals(Rank.SPECIES, pn.getRank()); } @@ -331,7 +350,6 @@ public void testSpeciesLevelPhraseName6() throws Exception { assertEquals("(BR Maslin 7711)", ((ALAParsedName) pn).getPhraseVoucher()); assertEquals("Manmanning", ((ALAParsedName) pn).cleanPhrase); assertEquals("Maslin7711", ((ALAParsedName) pn).cleanVoucher); - assertEquals("[aff. 
multispicata]", ((ALAParsedName) pn).getPhraseNominatingParty()); assertEquals(Rank.SPECIES, pn.getRank()); } @@ -443,7 +461,15 @@ public void testVirusName2() throws Exception { //@Test public void testAllNamesForType() { try { - com.opencsv.CSVReader reader = new com.opencsv.CSVReader(new FileReader("/data/names/Version2011/ala_concepts_dump.txt"), '\t', '"', '\\', 1); + CSVParser csvParser = new CSVParserBuilder() + .withSeparator('\t') + .withQuoteChar('"') + .withEscapeChar('\\') + .build(); + CSVReader reader = new CSVReaderBuilder(new FileReader("/data/names/Version2011/ala_concepts_dump.txt")) + .withCSVParser(csvParser) + .withSkipLines(1) + .build(); PhraseNameParser parser = new PhraseNameParser(); int i = 0; for (String[] values = reader.readNext(); values != null; values = reader.readNext()) { @@ -469,15 +495,29 @@ public void testAllNamesForType() { // See https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/1 - // At the moment, not able to correctly parse this out - @Ignore @Test - public void testSpeciesMarkerPhraseName() { + public void testRankMarkerPhraseName1() { try { PhraseNameParser parser = new PhraseNameParser(); ParsedName pn = parser.parse("Marsilea sp. Neutral Junction (D.E.Albrecht 9192)"); - pn = parser.parse("Asparagus asparagoides f. Western Cape (R.Taplin 1133)"); - assertEquals("RTaplin1133", ((ALAParsedName) pn).cleanVoucher); + assertEquals(ALAParsedName.class, pn.getClass()); + assertEquals("Albrecht9192", ((ALAParsedName) pn).cleanVoucher); + + } catch (Exception e) { + fail(e.getMessage()); + } + } + + // See https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/1 + // Form doesn't seem to work correctly as it is treating the voucher as an authort + @Test + @Ignore + public void testRankMarkerPhraseName2() { + try { + PhraseNameParser parser = new PhraseNameParser(); + ParsedName pn = parser.parse("Asparagus asparagoides f. 
Western Cape (R.Taplin 1133)"); + assertEquals(ALAParsedName.class, pn.getClass()); + assertEquals("Albrecht9192", ((ALAParsedName) pn).cleanVoucher); } catch (Exception e) { fail(e.getMessage()); diff --git a/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java b/ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java similarity index 90% rename from src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java rename to ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java index 5a8c11d5b..b6cde1356 100644 --- a/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java +++ b/ala-name-matching-model/src/test/java/au/org/ala/names/util/CleanedScientificNameTest.java @@ -1,5 +1,21 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.util; import org.junit.Test; diff --git a/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java b/ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java similarity index 83% rename from src/test/java/org/gbif/nameparser/PhraseNameParserTest.java rename to ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java index 49e1e56b9..d9485da2b 100644 --- a/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java +++ b/ala-name-matching-model/src/test/java/org/gbif/nameparser/PhraseNameParserTest.java @@ -1,10 +1,24 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package org.gbif.nameparser; import au.org.ala.names.model.ALAParsedName; -import au.org.ala.names.util.CleanedScientificName; -import org.codehaus.jackson.map.jsontype.NamedType; import org.gbif.api.model.checklistbank.ParsedName; import org.gbif.api.vocabulary.NameType; import org.junit.Before; diff --git a/ala-name-matching-search/pom.xml b/ala-name-matching-search/pom.xml new file mode 100644 index 000000000..3feaba613 --- /dev/null +++ b/ala-name-matching-search/pom.xml @@ -0,0 +1,55 @@ + + + 4.0.0 + + + au.org.ala + ala-name-matching + 4.0 + + + ala-name-matching-search + jar + + ALA Name Matching Search Library + A library that connects to a local name index and provides name lookup services. 
+ + + + ${project.parent.groupId} + ala-name-matching-model + ${project.version} + + + com.opencsv + opencsv + ${opencsv.version} + + + commons-io + commons-io + ${commons-io.version} + + + + org.apache.lucene + lucene-core + ${org.apache.lucene.version} + + + org.apache.lucene + lucene-backward-codecs + ${org.apache.lucene.version} + + + org.apache.lucene + lucene-analyzers-common + ${org.apache.lucene.version} + + + org.apache.lucene + lucene-queryparser + ${org.apache.lucene.version} + + + diff --git a/ala-name-matching-search/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java b/ala-name-matching-search/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java new file mode 100644 index 000000000..a8710a900 --- /dev/null +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2014 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ */ +package au.org.ala.names.lucene.analyzer; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.KeywordTokenizerFactory; +import org.apache.lucene.analysis.core.LowerCaseFilterFactory; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + +/** + * A custom KeywordAnalyzer that converts the text to lowercase before tokenizing + * the complete string as one token + * + * @author Natasha + */ +public final class LowerCaseKeywordAnalyzer { + private static final Logger logger = LoggerFactory.getLogger(LowerCaseKeywordAnalyzer.class); + + /** + * Get an instance of a lower-case keyword analyser. + * + * @return The analyser + */ + public static Analyzer newInstance() { + try { + return CustomAnalyzer.builder().withTokenizer(KeywordTokenizerFactory.class).addTokenFilter(LowerCaseFilterFactory.class).build(); + } catch (IOException ex) { + logger.error("Unable to build analyzer", ex); + throw new IllegalStateException(ex); + } + } + } diff --git a/src/main/java/au/org/ala/names/search/ALANameSearcher.java b/ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java similarity index 82% rename from src/main/java/au/org/ala/names/search/ALANameSearcher.java rename to ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java index dff4a273d..3cbe595df 100644 --- a/src/main/java/au/org/ala/names/search/ALANameSearcher.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/search/ALANameSearcher.java @@ -14,20 +14,15 @@ */ package au.org.ala.names.search; -import au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer; import au.org.ala.names.model.*; import au.org.ala.names.util.CleanedScientificName; import au.org.ala.names.util.TaxonNameSoundEx; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; 
import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; -import org.apache.lucene.index.*; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.*; import org.apache.lucene.store.FSDirectory; import org.gbif.api.exception.UnparsableException; @@ -35,11 +30,10 @@ import org.gbif.api.vocabulary.NameType; import org.gbif.api.vocabulary.Rank; import org.gbif.nameparser.PhraseNameParser; +import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric; +import uk.ac.shef.wit.simmetrics.similaritymetrics.Levenshtein; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; +import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; @@ -74,8 +68,6 @@ public class ALANameSearcher { protected Log log = LogFactory.getLog(ALANameSearcher.class); protected DirectoryReader cbReader, irmngReader, vernReader; protected IndexSearcher cbSearcher, irmngSearcher, vernSearcher, idSearcher; - protected ThreadLocal queryParser; - protected ThreadLocal idParser; protected TaxonNameSoundEx tnse; protected PhraseNameParser parser; public static final Pattern virusStopPattern = Pattern.compile(" virus| ictv| ICTV"); @@ -83,6 +75,21 @@ public class ALANameSearcher { public static final Pattern affPattern = Pattern.compile("([\\x00-\\x7F\\s]*) aff[#!?\\\\. ]([\\x00-\\x7F\\s]*)"); public static final Pattern cfPattern = Pattern.compile("([\\x00-\\x7F\\s]*) cf[#!?\\\\. 
]([\\x00-\\x7F\\s]*)"); + private static Comparator AUTOCOMPLETE_COMPARATOR = new Comparator() { + @Override + public int compare(Map o1, Map o2) { + if (o1 == o2) + return 0; + if (o1 == null) + return Integer.MAX_VALUE; + if (o2 == null) + return Integer.MIN_VALUE; + float score1 = (float) o1.getOrDefault("score", 1.0f); + float score2 = (float) o2.getOrDefault("score", 1.0f); + return -Float.compare(score1, score2); + } + }; + /** * A set of names that are cross rank homonyms. */ @@ -95,56 +102,33 @@ public ALANameSearcher(){} * as the source directory * * @param indexDirectory The directory that contains the index files for the scientific names, irmng and vernacular names. - * @throws CorruptIndexException * @throws IOException */ public ALANameSearcher(String indexDirectory) throws IOException { //Initialise CB index searching items log.debug("Creating the search object for the name matching api..."); - //make the query parsers thread safe - queryParser = new ThreadLocal() { - @Override - protected QueryParser initialValue() { - QueryParser qp = new QueryParser("genus", new LowerCaseKeywordAnalyzer()); - qp.setFuzzyMinSim(0.8f); //fuzzy match similarity setting. used to match the authorship. 
- return qp; - } - }; - idParser = new ThreadLocal() { - @Override - protected QueryParser initialValue() { - return new QueryParser( "lsid", new org.apache.lucene.analysis.core.KeywordAnalyzer()); - } - }; - - cbReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "cb")));//false + cbReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "cb")));//false cbSearcher = new IndexSearcher(cbReader); //Initialise the IRMNG index searching items - irmngReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "irmng"))); + irmngReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "irmng"))); irmngSearcher = new IndexSearcher(irmngReader); //initialise the Common name index searching items - vernReader = DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "vernacular"))); + vernReader = DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "vernacular"))); vernSearcher = new IndexSearcher(vernReader); //initialise the identifier index - idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(createIfNotExist(indexDirectory + File.separator + "id")))); + idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(findPath(indexDirectory + File.separator + "id")))); tnse = new TaxonNameSoundEx(); parser = new PhraseNameParser(); crossRankHomonyms = au.org.ala.names.util.FileUtils.streamToSet( this.getClass().getClassLoader().getResourceAsStream("au/org/ala/homonyms/cross_rank_homonyms.txt"), new java.util.HashSet(), true); } - private Path createIfNotExist(String indexDirectory) throws IOException { - + private Path findPath(String indexDirectory) throws IOException { File idxFile = new File(indexDirectory); - Path path = Paths.get(indexDirectory); if (!idxFile.exists()) { - FileUtils.forceMkdir(idxFile); - Analyzer analyzer = new 
StandardAnalyzer(); - IndexWriterConfig conf = new IndexWriterConfig(analyzer); - IndexWriter iw = new IndexWriter(FSDirectory.open(path), conf); - iw.commit(); - iw.close(); + throw new FileNotFoundException(idxFile.toString()); } + Path path = Paths.get(indexDirectory); return path; } @@ -154,8 +138,7 @@ private Path createIfNotExist(String indexDirectory) throws IOException { public void dumpSpecies() { try { OutputStreamWriter fileOut = new OutputStreamWriter(new FileOutputStream("/data/species.txt"), "UTF-8"); - Term term = new Term("rank", "species"); - TopDocs hits = cbSearcher.search(new TermQuery(term), 2000000); + TopDocs hits = cbSearcher.search(NameIndexField.RANK.search("species"), 2000000); for (ScoreDoc sdoc : hits.scoreDocs) { Document doc = cbReader.document(sdoc.doc); @@ -894,7 +877,7 @@ public NameSearchResult searchForRecord(String name, LinnaeanRankClassification */ public NameSearchResult searchForRecordByID(String id) { try { - List results = performSearch(ALANameIndexer.IndexField.ID.toString(), id, null, null, 1, null, false, idParser.get()); + List results = performSearch(NameIndexField.ID, id, null, null, 1, null, false); if (results.size() > 0) { results.get(0).setMatchType(MatchType.TAXON_ID); return results.get(0); @@ -1021,7 +1004,7 @@ private List searchForRecords(String name, RankType rank, Linn log.warn("Unable to parse " + name + ". " + e.getMessage()); } //Check for the exact match - List hits = performSearch(NameIndexField.NAME.toString(), cleaned.getNormalised(), rank, cl, max, MatchType.EXACT, true, queryParser.get()); + List hits = performSearch(NameIndexField.NAME, cleaned.getNormalised(), rank, cl, max, MatchType.EXACT, true); if (hits == null) // situation where searcher has not been initialised { return null; @@ -1043,12 +1026,13 @@ private List searchForRecords(String name, RankType rank, Linn String voucher = alapn.cleanVoucher; //String voucher = alapn.phraseVoucher != null ? 
voucherRemovePattern.matcher(alapn.phraseVoucher).replaceAll("") :null; String specific = alapn.getRank() != null && alapn.getRank().equals(Rank.SPECIES) ? null : alapn.getSpecificEpithet(); - String[][] searchFields = new String[4][]; - searchFields[0] = new String[]{RankType.GENUS.getRank(), genus}; - searchFields[1] = new String[]{NameIndexField.PHRASE.toString(), phrase}; - searchFields[2] = new String[]{NameIndexField.VOUCHER.toString(), voucher}; - searchFields[3] = new String[]{NameIndexField.SPECIFIC.toString(), specific}; - hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false, queryParser.get()); //don't want to check for homonyms yet... + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS, genus), + Value.of(NameIndexField.PHRASE, phrase), + Value.of(NameIndexField.VOUCHER, voucher), + Value.of(NameIndexField.SPECIFIC, specific) + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false); //don't want to check for homonyms yet... if (hits.size() == 1) { return hits; } else if (hits.size() > 1) { @@ -1069,7 +1053,7 @@ private List searchForRecords(String name, RankType rank, Linn if (cl.getAuthorship() == null && pn.isAuthorsParsed()) { cl.setAuthorship(pn.authorshipComplete()); } - hits = performSearch(ALANameIndexer.IndexField.NAME.toString(), canonicalName, rank, cl, max, MatchType.CANONICAL, true, queryParser.get()); + hits = performSearch(NameIndexField.NAME, canonicalName, rank, cl, max, MatchType.CANONICAL, true); if (hits.size() > 0) { return hits; } @@ -1079,12 +1063,13 @@ private List searchForRecords(String name, RankType rank, Linn String phrase = pn.getCultivarEpithet(); String voucher = null; String specific = pn.getRank() != null && pn.getRank().equals(Rank.SPECIES) ? 
null : pn.getSpecificEpithet(); - String[][] searchFields = new String[4][]; - searchFields[0] = new String[]{RankType.GENUS.getRank(), genus}; - searchFields[1] = new String[]{NameIndexField.PHRASE.toString(), phrase}; - searchFields[2] = new String[]{NameIndexField.VOUCHER.toString(), voucher}; - searchFields[3] = new String[]{NameIndexField.SPECIFIC.toString(), specific}; - hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false, queryParser.get()); + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS, genus), + Value.of(NameIndexField.PHRASE, phrase), + Value.of(NameIndexField.VOUCHER, voucher), + Value.of(NameIndexField.SPECIFIC, specific) + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.PHRASE, false); if (hits.size() > 0) { return hits; } @@ -1095,15 +1080,12 @@ private List searchForRecords(String name, RankType rank, Linn String genus = TaxonNameSoundEx.treatWord(pn.getGenusOrAbove(), "genus"); String specific = TaxonNameSoundEx.treatWord(pn.getSpecificEpithet(), "species"); String infra = pn.getInfraSpecificEpithet() == null ? null : TaxonNameSoundEx.treatWord(pn.getInfraSpecificEpithet(), "species"); - String[][] searchFields = new String[3][]; - searchFields[0] = new String[]{NameIndexField.GENUS_EX.toString(), genus}; - searchFields[1] = new String[]{NameIndexField.SPECIES_EX.toString(), specific}; - if (StringUtils.isNotEmpty(infra)) { - searchFields[2] = new String[]{NameIndexField.INFRA_EX.toString(), infra}; - } else { - searchFields[2] = new String[]{NameIndexField.INFRA_EX.toString(), ""}; - } - hits = performSearch(searchFields, rank, cl, max, MatchType.SOUNDEX, false, queryParser.get()); //don't want to check for homonyms yet... + List searchFields = Arrays.asList( + Value.of(NameIndexField.GENUS_EX, genus), + Value.of(NameIndexField.SPECIES_EX, specific), + Value.of(NameIndexField.INFRA_EX, StringUtils.isNotEmpty(infra) ? 
infra : "") + ); + hits = performSearch(searchFields, rank, cl, max, MatchType.SOUNDEX, false); //don't want to check for homonyms yet... if (hits.size() > 0) { return hits; } @@ -1146,140 +1128,126 @@ else if (hit.getAcceptedLsid() != null) { return acceptedLsid == null ? null : searchForRecordByLsid(acceptedLsid); } - private List performSearch(String field, String value, RankType rank, + private List performSearch(NameIndexField field, String value, RankType rank, LinnaeanRankClassification cl, int max, MatchType type, - boolean checkHomo, QueryParser parser) throws IOException, SearchResultException { - String[][] compValues = new String[1][]; - compValues[0] = new String[]{field, value}; - return performSearch(compValues, rank, cl, max, type, checkHomo, parser); + boolean checkHomo) throws IOException, SearchResultException { + return performSearch(Arrays.asList(Value.of(field, value)), rank, cl, max, type, checkHomo); } /** * Performs an index search based on the supplied field and name * - * @param compulsoryValues 2D array of field and value mappings to perform the search on + * @param compulsoryValues A list of required values * @param rank Optional rank of the value * @param cl The high taxa that form the classification for the search item * @param max The maximum number of results to return * @param type The type of search that is being performed * @param checkHomo Whether or not the result should check for homonyms. 
- * @param parser * @return * @throws IOException * @throws SearchResultException */ - private List performSearch(String[][] compulsoryValues, RankType rank, - LinnaeanRankClassification cl, int max, MatchType type, boolean checkHomo, - QueryParser parser) throws IOException, SearchResultException { + private List performSearch(List compulsoryValues, RankType rank, + LinnaeanRankClassification cl, int max, MatchType type, boolean checkHomo) throws IOException, SearchResultException { if (cbSearcher != null) { String scientificName = null; - StringBuilder query = new StringBuilder(); - for (String[] values : compulsoryValues) { - if (values[1] != null) { - - query.append("+" + values[0] + ":\"" + values[1] + "\""); - - if (values[0].equals(NameIndexField.NAME.toString())) - scientificName = values[1]; - } + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (Value value: compulsoryValues) { + if (value.value != null) { + builder.add(value.field.search(value.value), BooleanClause.Occur.MUST); + if (value.field == NameIndexField.NAME) + scientificName = value.value.toString(); + } } if (rank != null) { - //if the rank is below species include all names that are species level and below in case synonyms have changed ranks. - query.append("+("); - if (rank.getId() >= RankType.SPECIES.getId()) { - query.append(NameIndexField.RANK_ID.toString()).append(":[7000 TO 9999]"); - - } else - query.append(NameIndexField.RANK.toString() + ":\"" + rank.getRank() + "\""); - //cater for the situation where the search term could be a synonym that does not have a rank + int lower = rank.getId(); + int upper = rank.getId() >= RankType.SPECIES.getId() ? 9999 : rank.getId(); + BooleanQuery.Builder rankBuilder = new BooleanQuery.Builder(); + rankBuilder.add(NameIndexField.RANK_ID.searchRange(lower, upper), BooleanClause.Occur.SHOULD); + //cater for the situation where the search term could be a synonym that does not have a rank // also ALA added concepts do NOT have ranks. 
- query.append(" OR ").append(NameIndexField.iS_SYNONYM.toString()).append(":T OR ").append(NameIndexField.ALA).append(":T)"); - + rankBuilder.add(NameIndexField.iS_SYNONYM.search("T"), BooleanClause.Occur.SHOULD); + rankBuilder.add(NameIndexField.ALA.search("T"), BooleanClause.Occur.SHOULD); + builder.add(rankBuilder.build(), BooleanClause.Occur.MUST); } if (cl != null) { - query.append(cl.getLuceneSearchString(true)); + this.appendLuceneQuery(cl, builder, true); + } + Query query = builder.build(); - } + TopDocs hits = cbSearcher.search(query, max);//cbSearcher.search(boolQuery, max); - try { - Query scoreQuery = parser.parse(query.toString()); - TopDocs hits = cbSearcher.search(scoreQuery, max);//cbSearcher.search(boolQuery, max); + //now put the hits into the arrayof NameSearchResult + List results = new java.util.ArrayList(); - //now put the hits into the arrayof NameSearchResult - List results = new java.util.ArrayList(); - - for (ScoreDoc sdoc : hits.scoreDocs) { - NameSearchResult nsr = new NameSearchResult(cbReader.document(sdoc.doc), type); - nsr.computeMatch(cl); - results.add(nsr); - } - results.sort(Comparator.comparing(NameSearchResult::getMatchMetrics).reversed()); - if (results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).count() > 0) { - results = results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).collect(Collectors.toList()); - } - //HOMONYM CHECKS and other checks - if (checkHomo) { - - //check to see if one of the results is excluded - if (results.size() > 0) { - int exclCount = 0; - NameSearchResult notExcludedResult = null; - NameSearchResult excludedResult = null; - for (NameSearchResult nsr : results) { - if (nsr.getSynonymType() == au.org.ala.names.model.SynonymType.EXCLUDES) { - exclCount++; - excludedResult = nsr; - } else if (notExcludedResult == null) { - notExcludedResult = nsr; - } + for (ScoreDoc sdoc : hits.scoreDocs) { + NameSearchResult nsr = 
this.createResult(cbReader.document(sdoc.doc), type); + nsr.computeMatch(cl); + results.add(nsr); + } + results.sort(Comparator.comparing(NameSearchResult::getMatchMetrics).reversed()); + if (results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).count() > 0) { + results = results.stream().filter(r -> r.getMatchMetrics().getMatch() > MATCH_LIMIT).collect(Collectors.toList()); + } + //HOMONYM CHECKS and other checks + if (checkHomo) { + + //check to see if one of the results is excluded + if (results.size() > 0) { + int exclCount = 0; + NameSearchResult notExcludedResult = null; + NameSearchResult excludedResult = null; + for (NameSearchResult nsr : results) { + if (nsr.getSynonymType() == au.org.ala.names.model.SynonymType.EXCLUDES) { + exclCount++; + excludedResult = nsr; + } else if (notExcludedResult == null) { + notExcludedResult = nsr; } - if (exclCount > 0) { - //throw the basic exception if count == result size - if (exclCount == results.size()) { - throw new ExcludedNameException("The result is a name that has been excluded from the NSL", excludedResult); - } else if (notExcludedResult != null) { - //one of the results was an excluded concept - throw new ExcludedNameException("One of the results was excluded. Use the nonExcludedName for your match.", notExcludedResult, excludedResult); - } + } + if (exclCount > 0) { + //throw the basic exception if count == result size + if (exclCount == results.size()) { + throw new ExcludedNameException("The result is a name that has been excluded from the NSL", excludedResult); + } else if (notExcludedResult != null) { + //one of the results was an excluded concept + throw new ExcludedNameException("One of the results was excluded. 
Use the nonExcludedName for your match.", notExcludedResult, excludedResult); } } + } - //check to see if we have a situtation where a species has been split into subspecies and a synonym exists to the subspecies - checkForSpeciesSplit(results); + //check to see if we have a situtation where a species has been split into subspecies and a synonym exists to the subspecies + checkForSpeciesSplit(results); - //check to see if one of the results is a misapplied synonym - checkForMisapplied(results); + //check to see if one of the results is a misapplied synonym + checkForMisapplied(results); - //check result level homonyms - //TODO 2012-04-17: Work out edge case issues for canonical matches... - //checkResultLevelHomonym(results); + //check result level homonyms + //TODO 2012-04-17: Work out edge case issues for canonical matches... + //checkResultLevelHomonym(results); - //check to see if we have a cross rank homonym - //cross rank homonyms are resolvable if a rank has been supplied - if (rank == null) { - checkForCrossRankHomonym(results); - } + //check to see if we have a cross rank homonym + //cross rank homonyms are resolvable if a rank has been supplied + if (rank == null) { + checkForCrossRankHomonym(results); + } - //check to see if the search criteria could represent an unresolved genus or species homonym - if (results.size() > 0) { - RankType resRank = results.get(0).getRank(); - if ((resRank == RankType.GENUS || resRank == RankType.SPECIES) || (results.get(0).isSynonym() && (rank == null || rank == RankType.GENUS || rank == RankType.SPECIES))) { - NameSearchResult result = (cl != null && StringUtils.isNotBlank(cl.getAuthorship())) ? 
validateHomonymByAuthor(results, scientificName, cl) : validateHomonyms(results, scientificName, cl); - results.clear(); - results.add(result); - } + //check to see if the search criteria could represent an unresolved genus or species homonym + if (results.size() > 0) { + RankType resRank = results.get(0).getRank(); + if ((resRank == RankType.GENUS || resRank == RankType.SPECIES) || (results.get(0).isSynonym() && (rank == null || rank == RankType.GENUS || rank == RankType.SPECIES))) { + NameSearchResult result = (cl != null && StringUtils.isNotBlank(cl.getAuthorship())) ? validateHomonymByAuthor(results, scientificName, cl) : validateHomonyms(results, scientificName, cl); + results.clear(); + results.add(result); } } - - return results; - } catch (ParseException e) { - throw new SearchResultException("Error parsing " + query.toString() + "." + e.getMessage()); } + return results; } return null; } @@ -1513,7 +1481,7 @@ else if (rank == RankType.SPECIES && cl.getSpecies() == null) private boolean isHomonymResolvable(LinnaeanRankClassification cl) { TopDocs results = getIRMNGGenus(cl, RankType.GENUS); if (results != null) - return results.totalHits <= 1; + return results.totalHits.value <= 1; return false; } @@ -1528,12 +1496,10 @@ public TopDocs getIRMNGGenus(LinnaeanRankClassification cl, RankType rank) { if (cl != null && (cl.getGenus() != null || cl.getSpecies() != null)) { try { - - String searchString = "+rank:" + rank + " " + cl.getLuceneSearchString(false).trim(); - - - log.debug("Search string : " + searchString + " classification : " + cl); - Query query = queryParser.get().parse(searchString); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(NameIndexField.RANK.search(rank.getRank()), BooleanClause.Occur.MUST); + this.appendLuceneQuery(cl, builder, false); + Query query = builder.build(); log.debug("getIRMNG query: " + query.toString()); return irmngSearcher.search(query, 10); @@ -1562,7 +1528,7 @@ public RankType 
resolveIRMNGHomonym(LinnaeanRankClassification cl, RankType rank newcl.setSpecies(cl.getSpecies()); if (cl != null && (cl.getGenus() != null || cl.getSpecies() != null)) { TopDocs results = getIRMNGGenus(newcl, rank); - if (results == null || results.totalHits <= 1) + if (results == null || results.totalHits.value <= 1) return null; if (cl != null && cl.getKingdom() != null) { @@ -1570,39 +1536,39 @@ public RankType resolveIRMNGHomonym(LinnaeanRankClassification cl, RankType rank newcl.setKingdom(cl.getKingdom()); //Step 1 search for kingdom and genus results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.KINGDOM; } //Step 2 add the phylum - if (cl.getPhylum() != null && results.totalHits > 1) { + if (cl.getPhylum() != null && results.totalHits.value > 1) { newcl.setPhylum(cl.getPhylum()); results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.PHYLUM; //This may not be a good idea - else if (results.totalHits == 0) + else if (results.totalHits.value == 0) newcl.setPhylum(null);//just in case the phylum was specified incorrectly } //Step 3 try the class if (cl.getKlass() != null) {// && results.totalHits>1){ newcl.setKlass(cl.getKlass()); results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.CLASS; } //step 4 try order - if (cl.getOrder() != null && results.totalHits > 1) { + if (cl.getOrder() != null && results.totalHits.value > 1) { newcl.setOrder(cl.getOrder()); results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return RankType.ORDER; } //step 5 try the family - if (cl.getFamily() != null && results.totalHits > 1) { + if (cl.getFamily() != null && results.totalHits.value > 1) { newcl.setFamily(cl.getFamily()); results = getIRMNGGenus(newcl, rank); - if (results.totalHits == 1) + if (results.totalHits.value == 1) return 
RankType.FAMILY; } } @@ -1627,13 +1593,13 @@ public String searchForLSIDCommonName(String commonName) { */ public String getCommonNameForLSID(String lsid) { if (lsid != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); try { TopDocs results = vernSearcher.search(query, 1); log.debug("Number of matches for " + lsid + " " + results.totalHits); for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - return doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + return doc.get(NameIndexField.COMMON_NAME.toString()); } } catch (IOException e) { log.debug("Unable to access document for common name.", e); @@ -1652,16 +1618,14 @@ public String getCommonNameForLSID(String lsid, String[] languages) { if (lsid != null) { for (String language: languages) { try { - Query query = queryParser.get().parse( - ALANameIndexer.IndexField.LSID.toString() + ":\"" + lsid + "\" " + - " AND " + - ALANameIndexer.IndexField.LANGUAGE.toString() + ":\"" + language + "\" " - ); - TopDocs results = vernSearcher.search(query, 1); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + builder.add(NameIndexField.LSID.search(lsid), BooleanClause.Occur.MUST); + builder.add(NameIndexField.LANGUAGE.search(language), BooleanClause.Occur.MUST); + TopDocs results = vernSearcher.search(builder.build(), 1); log.debug("Number of matches for " + lsid + " " + results.totalHits); for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - return doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + return doc.get(NameIndexField.COMMON_NAME.toString()); } } catch (Exception e) { log.debug("Unable to access document for common name.", e); @@ -1678,7 +1642,7 @@ public String getCommonNameForLSID(String lsid, String[] languages) { */ public Set getCommonNamesForLSID(String lsid, int 
maxNumberOfNames) { if (lsid != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); try { TopDocs results = vernSearcher.search(query, maxNumberOfNames); //if all the results have the same scientific name result the LSID for the first @@ -1689,7 +1653,7 @@ public Set getCommonNamesForLSID(String lsid, int maxNumberOfNames) { int idx = 0; for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); - String name = doc.get(ALANameIndexer.IndexField.COMMON_NAME.toString()); + String name = doc.get(NameIndexField.COMMON_NAME.toString()); if(!lowerCaseResults.contains(name.toLowerCase())){ lowerCaseResults.add(name.toLowerCase()); names.add(name); @@ -1715,7 +1679,7 @@ public Set getCommonNamesForLSID(String lsid, int maxNumberOfNames) { */ private String getLSIDForUniqueCommonName(String name) { if (name != null) { - TermQuery query = new TermQuery(new Term(ALANameIndexer.IndexField.SEARCHABLE_COMMON_NAME.toString(), name.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""))); + Query query = NameIndexField.SEARCHABLE_COMMON_NAME.search(name); try { TopDocs results = vernSearcher.search(query, 10); //if all the results have the same scientific name result the LSID for the first @@ -1725,10 +1689,10 @@ private String getLSIDForUniqueCommonName(String name) { for (ScoreDoc sdoc : results.scoreDocs) { org.apache.lucene.document.Document doc = vernSearcher.doc(sdoc.doc); if (firstLsid == null) { - firstLsid = doc.get(ALANameIndexer.IndexField.LSID.toString()); - firstName = doc.get(ALANameIndexer.IndexField.NAME.toString()); + firstLsid = doc.get(NameIndexField.LSID.toString()); + firstName = doc.get(NameIndexField.NAME.toString()); } else { - if (!doSciNamesMatch(firstName, doc.get(ALANameIndexer.IndexField.NAME.toString()))) + if (!doSciNamesMatch(firstName, doc.get(NameIndexField.NAME.toString()))) return null; } } @@ 
-1791,11 +1755,11 @@ public NameSearchResult searchForCommonName(String name) { */ public String getPrimaryLsid(String lsid) { if (lsid != null) { - TermQuery tq = new TermQuery(new Term("lsid", lsid)); + Query tq = NameIndexField.LSID.search(lsid); try { org.apache.lucene.search.TopDocs results = idSearcher.search(tq, 1); - if (results.totalHits > 0) - return idSearcher.doc(results.scoreDocs[0].doc).get("reallsid"); + if (results.totalHits.value > 0) + return idSearcher.doc(results.scoreDocs[0].doc).get(NameIndexField.REAL_LSID.toString()); } catch (IOException e) { } } @@ -1806,12 +1770,12 @@ public String getPrimaryLsid(String lsid) { public NameSearchResult searchForRecordByLsid(String lsid) { NameSearchResult result = null; try { - Query query = new TermQuery(new Term(NameIndexField.LSID.toString(), lsid)); + Query query = NameIndexField.LSID.search(lsid); TopDocs hits = this.idSearcher.search(query, 1); - if (hits.totalHits == 0) + if (hits.totalHits.value == 0) hits = this.cbSearcher.search(query, 1); - if (hits.totalHits > 0) - return new NameSearchResult(cbSearcher.doc(hits.scoreDocs[0].doc), MatchType.TAXON_ID); + if (hits.totalHits.value > 0) + return this.createResult(cbSearcher.doc(hits.scoreDocs[0].doc), MatchType.TAXON_ID); } catch (Exception ex) { log.error("Unable to search for record by LSID " + lsid, ex); } @@ -1845,7 +1809,7 @@ public List getGuidsForTaxa(List taxaQueries) { return guids; } - private void appendAutocompleteResults(Map output, TopDocs results, boolean includeSynonyms, boolean commonNameResults) throws IOException { + private void appendAutocompleteResults(Map output, TopDocs results, boolean includeSynonyms, boolean commonNameResults, String q, AbstractStringMetric similarity) throws IOException { ScoreDoc[] scoreDocs = results.scoreDocs; int scoreDocsCount = scoreDocs.length; for(int excludedResult = 0; excludedResult < scoreDocsCount; ++excludedResult) { @@ -1853,15 +1817,20 @@ private void appendAutocompleteResults(Map output, 
TopDocs results, Document src = commonNameResults ? vernSearcher.doc(i.doc) : cbSearcher.doc(i.doc); NameSearchResult nsr = commonNameResults ? searchForRecordByLsid(src.get("lsid")) - : new NameSearchResult(src, null); + : this.createResult(src, null); if (nsr == null || (nsr.getLeft() == null && !includeSynonyms)) continue; - Map m = formatAutocompleteNsr(i.score, nsr); + String name = commonNameResults ? src.get("common_orig") : src.get("name"); + float score = similarity.getSimilarity(q, name); + score *= i.score; + if (!commonNameResults) + score *= 2.0f; + Map m = formatAutocompleteNsr(score, nsr); //use the matched common name if (commonNameResults) { - m.put("commonname", src.get("common_orig")); + m.put("commonname", name); m.put("match", "commonName"); } else { m.put("match", "scientificName"); @@ -1889,10 +1858,7 @@ private void appendAutocompleteResults(Map output, TopDocs results, } if (((nsr != null && nsr.getAcceptedLsid() == null) || includeSynonyms) && m != null) { - if (m.get("name").toString().equals("Acacia")) { - int aa = 4; - } - Map existing = output.get(m.get("lsid").toString()); + Map existing = output.get(m.get("lsid").toString()); if (existing == null) { output.put(m.get("lsid").toString(), m); } else { @@ -1905,16 +1871,16 @@ private void appendAutocompleteResults(Map output, TopDocs results, } } - private Query buildAutocompleteQuery(String field, String q, boolean allSearches) { + private Query buildAutocompleteQuery(NameIndexField field, String q, boolean allSearches) { //best match - Query fq1 = new BoostQuery(new TermQuery(new Term(field,q)), 12f); //exact match + Query fq1 = new BoostQuery(field.search(q), 12f); //exact match //partial matches - Query fq5 = new WildcardQuery(new Term(field,q + "*")); //begins with that begins with - Query fq6 = new WildcardQuery(new Term(field,"* " + q + "*")); //contains word that begins with + Query fq5 = field.searchWildcard(q + "*"); //begins with that begins with + Query fq6 = 
field.searchWildcard("* " + q + "*"); //contains word that begins with //any match - Query fq7 = new WildcardQuery(new Term(field,"*" + q + "*")); //any match + Query fq7 = field.searchWildcard("*" + q + "*"); //any match //join BooleanQuery o = new BooleanQuery.Builder() @@ -1927,8 +1893,8 @@ private Query buildAutocompleteQuery(String field, String q, boolean allSearches } private String getPreferredGuid(String taxonConceptGuid) throws Exception { - Query qGuid = new TermQuery(new Term("guid", taxonConceptGuid)); - Query qOtherGuid = new TermQuery(new Term("otherGuid", taxonConceptGuid)); + Query qGuid = NameIndexField.GUID.search(taxonConceptGuid); + Query qOtherGuid = NameIndexField.OTHER_GUID.search(taxonConceptGuid); BooleanQuery fullQuery = new BooleanQuery.Builder() .add(qGuid, BooleanClause.Occur.SHOULD) @@ -1937,7 +1903,7 @@ private String getPreferredGuid(String taxonConceptGuid) throws Exception { TopDocs topDocs = cbSearcher.search(fullQuery, 1); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document doc = cbSearcher.doc(scoreDoc.doc); - return doc.get("guid"); + return doc.get(NameIndexField.GUID.toString()); } return taxonConceptGuid; } @@ -2052,10 +2018,10 @@ private String findLSIDByConcatName(String name) { try { String concatName = concatName(name); - Query query = new TermQuery(new Term("concat_name", concatName)); + Query query = NameIndexField.CONCAT_NAME.search(concatName); TopDocs topDocs = cbSearcher.search(query, 2); - if (topDocs != null && topDocs.totalHits == 1) { + if (topDocs != null && topDocs.totalHits.value == 1) { for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document doc = cbSearcher.doc(scoreDoc.doc); return doc.get("guid"); @@ -2096,36 +2062,36 @@ private String getExtendedTaxonConceptByGuid(String guid, boolean checkPreferred */ public List autocomplete(String q, int max, boolean includeSynonyms) { try { - if(false) { - return null; - } else { - Map output = new HashMap(); - - //more queries for better scoring values - String 
lq = q.toLowerCase(); - String uq = q.toUpperCase(); - - //name search - Query fq = buildAutocompleteQuery("name", lq, false); - BooleanQuery b = new BooleanQuery.Builder() - .add(fq, BooleanClause.Occur.MUST) - .add(new WildcardQuery(new Term("left", "*")), includeSynonyms ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST) - .build(); - TopDocs results = cbSearcher.search(b, max); - appendAutocompleteResults(output, results, includeSynonyms, false); - - //format search term for the current common name index - uq = concatName(uq).toUpperCase(); - - //common name search - fq = buildAutocompleteQuery("common", uq, true); - results = vernSearcher.search(fq, max); - appendAutocompleteResults(output, results, includeSynonyms, true); - - return new ArrayList(output.values()); - } + AbstractStringMetric similarity = new Levenshtein(); + Map output = new HashMap<>(); + + //more queries for better scoring values + String lq = q.toLowerCase(); + String uq = q.toUpperCase(); + + //name search + Query fq = buildAutocompleteQuery(NameIndexField.NAME, lq, false); + BooleanQuery.Builder bb = new BooleanQuery.Builder(); + bb.add(fq, BooleanClause.Occur.MUST); + if (!includeSynonyms) + bb.add(NameIndexField.iS_SYNONYM.search("T"), BooleanClause.Occur.MUST_NOT); + BooleanQuery b = bb.build(); + TopDocs results = cbSearcher.search(b, max); + appendAutocompleteResults(output, results, includeSynonyms, false, q, similarity); + + //format search term for the current common name index + uq = concatName(uq).toUpperCase(); + + //common name search + fq = buildAutocompleteQuery(NameIndexField.SEARCHABLE_COMMON_NAME, uq, true); + results = vernSearcher.search(fq, max); + appendAutocompleteResults(output, results, includeSynonyms, true, q, similarity); + + List matches = new ArrayList<>(output.values()); + matches.sort(AUTOCOMPLETE_COMPARATOR); + return matches; } catch (Exception e) { - log.error("Autocomplete error.",e); + log.error("Autocomplete error.", e); } return null; } @@ 
-2165,6 +2131,90 @@ private String escapeQueryChars(String s) { return sb.toString(); } + /** + * Construct a name match result from a document. + * + * @param doc The document + * @param type The match type + * + * @return The resultng name match + */ + protected NameSearchResult createResult(Document doc, MatchType type) { + String name = doc.get(NameIndexField.NAME_CANONICAL.toString()); + if (name == null) + name = doc.get(NameIndexField.NAME.toString()); + if (name == null) + name = doc.get(NameIndexField.NAME_COMPLETE.toString()); + LinnaeanRankClassification rankClass = new LinnaeanRankClassification(doc.get(RankType.KINGDOM.getRank()), + doc.get(RankType.PHYLUM.getRank()), + doc.get(RankType.CLASS.getRank()), + doc.get(RankType.ORDER.getRank()), + doc.get(RankType.FAMILY.getRank()), + doc.get(RankType.GENUS.getRank()), + name); + rankClass.setSpecies(doc.get(RankType.SPECIES.getRank())); + //add the ids + rankClass.setKid(doc.get("kid")); + rankClass.setPid(doc.get("pid")); + rankClass.setCid(doc.get("cid")); + rankClass.setOid(doc.get("oid")); + rankClass.setFid(doc.get("fid")); + rankClass.setGid(doc.get("gid")); + rankClass.setSid(doc.get("sid")); + rankClass.setAuthorship(doc.get(NameIndexField.AUTHOR.toString())); + + String id = doc.get(NameIndexField.ID.toString()); + String lsid = doc.get(NameIndexField.LSID.toString()); + String kingdom = doc.get(RankType.KINGDOM.getRank()); + RankType rank = null; + try { + rank = RankType.getForId(Integer.parseInt(doc.get(NameIndexField.RANK_ID.toString()))); + } catch (Exception e) { + } + //left and right values for the taxon concept + String left = doc.get("left"); + String right = doc.get("right"); + SynonymType synonymType = SynonymType.getTypeFor(doc.get(NameIndexField.SYNONYM_TYPE.toString())); + String acceptedLsid = doc.get(NameIndexField.ACCEPTED.toString()); + IndexableField pf = doc.getField(NameIndexField.PRIORITY.toString()); + Integer priority = pf == null ? 
null : pf.numericValue().intValue(); + NameSearchResult result = new NameSearchResult(id, lsid, acceptedLsid, left, right, rankClass, rank, type, synonymType, priority); + result.setRank(rank); + result.setLeft(left); + result.setRight(right); + return result; + } + + + /** + * Returns the additional string that needs to be included in a search + * + * @param optional Indicates whether the the terms should be optional + * @return + */ + public void appendLuceneQuery(LinnaeanRankClassification classification, BooleanQuery.Builder builder, boolean optional) { + BooleanClause.Occur occurs = optional ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.FILTER; + StringBuilder sb = new StringBuilder(); + if (StringUtils.isNotEmpty(classification.getKingdom())) + builder.add(NameIndexField.KINGDOM.search(classification.getKingdom()), occurs); + if (StringUtils.isNotEmpty(classification.getPhylum())) + builder.add(NameIndexField.PHYLUM.search(classification.getPhylum()), occurs); + if (StringUtils.isNotEmpty(classification.getKlass())) + builder.add(NameIndexField.CLASS.search(classification.getKlass()), occurs); + if (StringUtils.isNotEmpty(classification.getOrder())) + builder.add(NameIndexField.ORDER.search(classification.getOrder()), occurs); + if (StringUtils.isNotEmpty(classification.getFamily())) + builder.add(NameIndexField.FAMILY.search(classification.getFamily()), occurs); + if (StringUtils.isNotEmpty(classification.getGenus())) + builder.add(NameIndexField.GENUS.search(classification.getGenus()), occurs); + if (StringUtils.isNotEmpty(classification.getSpecies())) + builder.add(NameIndexField.SPECIES.search(classification.getSpecies()), occurs); + //authorship is always optional due to inconsistencies in the name format etc... 
+ if (StringUtils.isNotEmpty(classification.getAuthorship())) + builder.add(NameIndexField.AUTHOR.search(classification.getAuthorship()), BooleanClause.Occur.SHOULD); + } + + public static void main(String[] args) throws IOException { ALANameSearcher nameindex = new ALANameSearcher(args[0]); @@ -2177,4 +2227,21 @@ public static void main(String[] args) throws IOException { } } + /** + * Values for fields + */ + private static class Value { + public NameIndexField field; + public T value; + + private Value(NameIndexField field, T value) { + this.field = field; + this.value = value; + } + + public static Value of(NameIndexField field, T value) { + return new Value<>(field, value); + } + } + } diff --git a/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java b/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java new file mode 100644 index 000000000..b8a6efc60 --- /dev/null +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/search/FieldType.java @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + +package au.org.ala.names.search; + +import au.org.ala.names.lucene.analyzer.LowerCaseKeywordAnalyzer; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.*; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.util.QueryBuilder; + +/** + * The type of field stored in the lucene index. + *

+ * Used to determine how to store and search for a field. + *

+ */ +abstract public class FieldType { + protected static final ThreadLocal ANALYZER = ThreadLocal.withInitial( + () -> LowerCaseKeywordAnalyzer.newInstance() + ); + protected static final ThreadLocal QUERY_BUILDER = ThreadLocal.withInitial( + () -> new QueryBuilder(ANALYZER.get()) + ); + protected static final ThreadLocal TERM_FIELD_TYPE = ThreadLocal.withInitial( + () -> { + org.apache.lucene.document.FieldType ft = new org.apache.lucene.document.FieldType(TextField.TYPE_STORED); + ft.setOmitNorms(true); + return ft; + } + ); + + /** The class of term stored */ + private Class class_; + /** The name of the field type */ + private String name; + + /** + * Construct with a name + * + * @param name The name + */ + public FieldType(Class class_, String name) { + this.class_ = class_; + this.name = name; + } + + /** + * Store a field into a lucene document. + *

+ * This may involve storing multiple lucene fields for range types. + *

+ * + * @param value The value to store + * @param name The name of the field + * @param document The document to add the field to + */ + abstract public void store(T value, String name, Document document); + + /** + * Generate a query for a field of this type. + * + * @param value The value to search for + * @param name The field name + * @return A query that searches for the value + */ + abstract public Query search(T value, String name); + + /** + * Search for a value in a range (inclusive). + *

+ * By default, this throws an {@link UnsupportedOperationException}. + * Types that have a concept of range can override this to implement a range search. + *

+ * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * + * @return A query based on the range + */ + public Query searchRange(T lower, T upper, String name) { + throw new UnsupportedOperationException("Field type " + this.name + " does not support ranges"); + } + + + /** + * Store-only field. + */ + public static final FieldType STORE = new FieldType(String.class,"store") { + @Override + public void store(String value, String name, Document document) { + document.add(new StoredField(name, value)); + } + + @Override + public Query search(String value, String name) { + throw new UnsupportedOperationException("Store-only field"); + } + }; + + /** + * An exact identifier. + *

+ * Storage and search is accomplished via exact lookup. + *

+ */ + public static final FieldType IDENTIFIER = new FieldType(String.class,"identifier") { + @Override + public void store(String value, String name, Document document) { + document.add(new StringField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + return new TermQuery(new Term(name, value)); + } + }; + + /** + * A simple term. + *

+ * Storage and search is accomplished via case-insensitive storage and lookup. + *

+ */ + public static final FieldType TERM = new FieldType(String.class, "term") { + @Override + public void store(String value, String name, Document document) { + Field field = new Field(name, value, TERM_FIELD_TYPE.get()); + document.add(field); + } + + @Override + public Query search(String value, String name) { + return QUERY_BUILDER.get().createPhraseQuery(name, value); + } + }; + + /** + * A tokenisable term. + *

+ * Storage and search is accomplished via case-insensitive tokenisation and search. + *

+ */ + public static final FieldType TEXT = new FieldType(String.class, "text") { + + @Override + public void store(String value, String name, Document document) { + document.add(new TextField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + return QUERY_BUILDER.get().createPhraseQuery(name, value); + } + }; + + /** + * A common name. + *

+ * Storage and search is based on a simplified lookup where the value is upper-cased + * and non alpha-numeric characters are removed, making it case insensitive. + *

+ */ + public static final FieldType COMMON = new FieldType(String.class,"common") { + @Override + public void store(String value, String name, Document document) { + value = value.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""); + document.add(new StringField(name, value, Field.Store.YES)); + } + + @Override + public Query search(String value, String name) { + value = value.toUpperCase().replaceAll("[^A-Z0-9ÏËÖÜÄÉÈČÁÀÆŒ]", ""); + return new TermQuery(new Term(name, value)); + } + }; + + /** + * An integer term. + *

+ * Storage and search allow range-based queries. + *

+ */ + public static final FieldType INTEGER = new FieldType(Integer.class, "integer") { + @Override + public void store(Integer value, String name, Document document) { + document.add(new IntPoint(name, value)); + document.add(new StoredField(name, value)); + } + + @Override + public Query search(Integer value, String name) { + return IntPoint.newExactQuery(name, value); + } + + /** + * Search for a value in a range (inclusive). + * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * @return A query based on the range + */ + @Override + public Query searchRange(Integer lower, Integer upper, String name) { + return IntPoint.newRangeQuery(name, lower, upper); + } + }; + + /** + * A double term. + *

+ * Storage and search allow range-based queries. + *

+ */ + public static final FieldType DOUBLE = new FieldType(Double.class, "double") { + @Override + public void store(Double value, String name, Document document) { + document.add(new DoublePoint(name, value)); + document.add(new StoredField(name, value)); + } + + @Override + public Query search(Double value, String name) { + return DoublePoint.newExactQuery(name, value); + } + + /** + * Search for a value in a range (inclusive). + * + * @param lower The lower bound + * @param upper The upper bound + * @param name The field name + * @return A query based on the range + */ + @Override + public Query searchRange(Double lower, Double upper, String name) { + return DoublePoint.newRangeQuery(name, lower, upper); + } + }; + +} diff --git a/ala-name-matching-search/src/main/java/au/org/ala/names/search/NameIndexField.java b/ala-name-matching-search/src/main/java/au/org/ala/names/search/NameIndexField.java new file mode 100644 index 000000000..16219e6d6 --- /dev/null +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/search/NameIndexField.java @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2014 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + */ +package au.org.ala.names.search; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.WildcardQuery; + +/** + * An Enum for all the fields that are indexed for the name matching. 
+ * + * @author Natasha Carter + */ +public enum NameIndexField { + ID("id", FieldType.IDENTIFIER), + GUID("guid", FieldType.IDENTIFIER), + OTHER_GUID("otherGuid", FieldType.IDENTIFIER), + LEFT("left", FieldType.INTEGER), + RIGHT("right", FieldType.INTEGER), + LSID("lsid", FieldType.IDENTIFIER), + REAL_LSID("reallsid", FieldType.STORE), + PARENT_ID("parent_id", FieldType.IDENTIFIER), + DOCUMENT_TYPE("doctype", FieldType.IDENTIFIER), + ACCEPTED("accepted_lsid", FieldType.IDENTIFIER), + iS_SYNONYM("is_synonym", FieldType.IDENTIFIER),//whether or not the record is a synonym + KINGDOM("kingdom", FieldType.TERM), + KINGDOM_ID("kid", FieldType.STORE), + PHYLUM("phylum", FieldType.TERM), + PHYLUM_ID("pid", FieldType.STORE), + CLASS("class", FieldType.TERM), + CLASS_ID("cid", FieldType.STORE), + ORDER("order", FieldType.TERM), + ORDER_ID("oid", FieldType.STORE), + FAMILY("family", FieldType.TERM), + FAMILY_ID("fid", FieldType.STORE), + GENUS("genus", FieldType.TERM), + GENUS_ID("gid", FieldType.STORE), + GENUS_EX("genus_ex", FieldType.TERM), //genus sounds like expression - handles masculine and feminine too. 
+ SPECIES("species", FieldType.TERM), + SPECIES_ID("sid", FieldType.STORE), + SPECIES_EX("specific_ex", FieldType.TERM),// specific epithet sounds like expression + INFRA_EX("infra_ex", FieldType.TERM),//infra specific epithet sounds like expression + SPECIFIC("specific", FieldType.TERM), + INFRA_SPECIFIC("infra", FieldType.TERM), + NAME("name", FieldType.TEXT),// search name + OTHER_NAMES("other_names", FieldType.TEXT),// Alternative names + NAME_CANONICAL("name_canonical", FieldType.TEXT), // Canonical name + NAME_COMPLETE("name_complete", FieldType.TEXT), // Complete name + SEARCHABLE_COMMON_NAME("common", FieldType.COMMON), + COMMON_NAME("common_orig", FieldType.TEXT), + CONCAT_NAME("concat_name", FieldType.TERM), + RANK_ID("rank_id", FieldType.INTEGER), + RANK("rank", FieldType.TERM), + AUTHOR("author", FieldType.TEXT), + PHRASE("phrase", FieldType.TEXT),//stores the values of a "phrase" name. Some more intelligence will be needed when matching these + VOUCHER("voucher", FieldType.TEXT), //stores a voucher value minus the spaces and fullstops. 
+ ALA("ala", FieldType.IDENTIFIER), //stores whether or not it is an ALA generated name + DATASET_ID("dataset_id", FieldType.IDENTIFIER), // The source dataset + SYNONYM_TYPE("syn_type", FieldType.IDENTIFIER), //stores the type of synonym that it represents + HOMONYM("homonym", FieldType.IDENTIFIER), + LANGUAGE("lang", FieldType.IDENTIFIER), + /* Stores the priority score associated with a taxon */ + PRIORITY("priority", FieldType.INTEGER); + + /** The field name */ + String name; + /** The field type */ + FieldType type; + + NameIndexField(String name, FieldType type) { + this.name = name; + this.type = type; + } + + public String toString() { + return name; + } + + /** + * Store a value into this field in a document + * + * @param value The value + * @param document The document + */ + public void store(T value, Document document) { + if (value == null) + return; + this.type.store(value, this.name, document); + } + + /** + * Make a query for this field for a value. + * + * @param value The value + * + * @return A matching query + */ + public Query search(T value) { + return this.type.search(value, this.name); + } + + /** + * Make a range query for this field for a value. + * + * @param lower The lower value (inclusive) + * @param upper The upper value (inclusive) + * + * @return A matching query + */ + public Query searchRange(T lower, T upper) { + return this.type.searchRange(lower, upper, this.name); + } + + + /** + * Make a wildcard query for this field for a value. 
+ * + * @param value The value, including "*" for wildcards + * + * @return A matching query + */ + public Query searchWildcard(String value) { + return new WildcardQuery(new Term(this.name, value)); + } + +} diff --git a/src/main/java/au/org/ala/names/util/FileUtils.java b/ala-name-matching-search/src/main/java/au/org/ala/names/util/FileUtils.java similarity index 98% rename from src/main/java/au/org/ala/names/util/FileUtils.java rename to ala-name-matching-search/src/main/java/au/org/ala/names/util/FileUtils.java index cd3cee8ca..8b0fde71e 100644 --- a/src/main/java/au/org/ala/names/util/FileUtils.java +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/util/FileUtils.java @@ -19,7 +19,7 @@ import java.util.Set; import org.apache.commons.io.LineIterator; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; /** * Some Generic file utilities. diff --git a/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java b/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java new file mode 100644 index 000000000..8708e97cb --- /dev/null +++ b/ala-name-matching-search/src/main/java/au/org/ala/names/util/TaxonNameSoundEx.java @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
/**
 * A Java implementation of the taxon-name "sound ex" (sounds-like) algorithm supplied by Tony Rees.
 * Copied from the Taxamatch project; the full Taxamatch matcher is not needed here.
 *
 * Two entry points are offered: the static {@code normalize}/{@code treatWord} pair, which
 * produces a phonetic key for a single name part, and the instance method {@code soundEx},
 * which produces a key whose letters (after the first) are alphabetised within each word.
 *
 * NOTE(review): keys for words starting with "X" and for input containing qualifiers such as
 * " sp." differ from those produced before the fixes in this class; any index that stores
 * previously computed keys presumably needs rebuilding - confirm against the index builder.
 */
public class TaxonNameSoundEx {

    /** Leading-letter rewrites applied by {@link #treatWord}: each entry is {prefix, replacement}. */
    private static final String[][] TREAT_WORD_LEADING_RULES = {
        {"AE", "E"}, {"CN", "N"}, {"CT", "T"}, {"CZ", "C"}, {"DJ", "J"}, {"EA", "E"},
        {"EU", "U"}, {"GN", "N"}, {"KN", "N"}, {"MC", "MAC"}, {"MN", "N"}, {"OE", "E"},
        {"QU", "Q"}, {"PS", "S"}, {"PT", "T"}, {"TS", "S"}, {"WR", "R"}, {"X", "Z"},
    };

    /** Leading-letter rewrites applied by {@link #selectiveReplaceFirstChar}: {prefix, replacement}. */
    private static final String[][] SOUND_EX_LEADING_RULES = {
        {"\u00c6", "E"}, {"AE", "E"}, {"CN", "N"}, {"CT", "T"}, {"CZ", "C"}, {"DJ", "J"},
        {"EA", "E"}, {"EU", "U"}, {"GN", "N"}, {"KN", "N"}, {"MN", "N"}, {"OE", "E"},
        {"QU", "Q"}, {"PS", "S"}, {"PT", "T"}, {"TS", "S"}, {"X", "Z"},
    };

    /** Upper-case accented letters folded to plain ASCII by {@link #normalize}. */
    private static final String ACCENTED =
            "\u00c1\u00c9\u00cd\u00d3\u00da\u00c0\u00c8\u00cc\u00d2\u00d9"
            + "\u00c2\u00ca\u00ce\u00d4\u00db\u00c4\u00cb\u00cf\u00d6\u00dc\u00c3\u00d1\u00d5"
            + "\u00c5\u00c7\u00d8";

    /** ASCII replacements, position for position, for {@link #ACCENTED}. */
    private static final String UNACCENTED = "AEIOUAEIOUAEIOUAEIOUANOACO";

    /**
     * Replace each character of {@code transSource} found in {@code source} with the character at
     * the same position in {@code transTarget}. Source characters with no counterpart (target
     * shorter than source) are replaced by a space.
     *
     * @param source The string to translate
     * @param transSource The characters to look for
     * @param transTarget The replacement characters, by position
     * @return The translated string
     */
    private static String translate(String source, String transSource, String transTarget) {
        String result = source;
        for (int i = 0; i < transSource.length(); i++) {
            char replacement = i < transTarget.length() ? transTarget.charAt(i) : ' ';
            result = result.replace(transSource.charAt(i), replacement);
        }
        return result;
    }

    /**
     * Rewrite the start of a word using the first rule whose prefix matches.
     *
     * @param word The (non-null) word
     * @param rules The {prefix, replacement} rules, tried in order
     * @return The word with its leading letters rewritten, or the word itself if no rule applies
     */
    private static String replaceLeading(String word, String[][] rules) {
        for (String[] rule : rules) {
            if (word.startsWith(rule[0])) {
                return rule[1] + word.substring(rule[0].length());
            }
        }
        return word;
    }

    /**
     * Normalise a name for phonetic treatment: strip qualifiers (cf, aff., sp., ...), upper-case,
     * remove anything in angle brackets, fold accented letters to ASCII and drop every character
     * that is not a letter, a space or a full stop.
     *
     * @param str The name to normalise, may be null
     * @return The normalised name, or null if the input is null or nothing is left
     */
    public static String normalize(String str) {
        if (str == null) {
            return null;
        }

        String output = str;

        // Qualifier removal is case-sensitive, so it has to happen before upper-casing.
        output = output.replace(" cf ", " ");
        output = output.replace(" cf. ", " ");
        output = output.replace(" near ", " ");
        output = output.replace(" aff. ", " ");
        output = output.replace(" sp.", " ");
        output = output.replace(" spp.", " ");
        output = output.replace(" spp ", " ");

        // Replace any HTML ampersand entity (also case-sensitive). The entity is spelled in two
        // pieces so that tooling which decodes entities cannot turn this into a no-op.
        output = output.replace(" &" + "amp; ", " & ");

        // Continue from the cleaned-up text (not the raw input); Locale.ROOT keeps the result
        // independent of the default locale (e.g. Turkish dotted/dotless i).
        output = output.toUpperCase(java.util.Locale.ROOT);

        // remove any content in angle brackets (e.g. html tags)
        output = output.replaceAll("\\<.+?\\>", "");

        output = translate(output, ACCENTED, UNACCENTED);
        output = output.replace("\u00c6", "AE");
        output = output.replaceAll("[^a-zA-Z .]", "");

        output = output.trim();
        return output.isEmpty() ? null : output;
    }

    /**
     * Build the phonetic key for a single name part.
     *
     * @param str2 The word to treat, may be null
     * @param wordType The kind of word; {@code "species"} additionally equates the endings
     *                 IS, IM and AS with A
     * @return The phonetic key, or null if the word normalises to nothing
     */
    public static String treatWord(String str2, String wordType) {
        String temp = normalize(str2);
        if (temp == null || temp.isEmpty()) {
            return temp;
        }

        // Do some selective replacement on the leading letter/s only
        temp = replaceLeading(temp, TREAT_WORD_LEADING_RULES);

        // Keep the leading character, then do selected "soundalike" replacements on the rest.
        // AE, OE, E, U, Y and I are equated; IA and A; K and C; SC and S; A and O; H is dropped.
        char startLetter = temp.charAt(0);
        String rest = temp.substring(1);
        rest = rest.replace("AE", "I");
        rest = rest.replace("IA", "A");
        rest = rest.replace("OE", "I");
        rest = rest.replace("OI", "A");
        rest = rest.replace("SC", "S");
        rest = rest.replace("E", "I");
        rest = rest.replace("O", "A");
        rest = rest.replace("U", "I");
        rest = rest.replace("Y", "I");
        rest = rest.replace("K", "C");
        // NOTE(review): the algorithm description equates Z with S, but this has always mapped
        // Z to C; left unchanged because existing keys depend on it - confirm which is intended.
        rest = rest.replace("Z", "C");
        rest = rest.replace("H", "");
        temp = startLetter + rest;

        // now drop any repeated characters (AA becomes A, BB or BBB becomes B, etc.)
        temp = temp.replaceAll("(\\w)\\1+", "$1");

        // Compare by value: a reference comparison only works for interned literals.
        if ("species".equals(wordType)
                && (temp.endsWith("IS") || temp.endsWith("IM") || temp.endsWith("AS"))) {
            temp = temp.substring(0, temp.length() - 2) + "A";
        }
        return temp;
    }

    /**
     * Returns the SoundEx for the source string
     *
     * @param source String to get the sound ex of, may be null
     * @return The sound ex string, or null for a null source
     */
    public String soundEx(String source) {
        if (source == null) {
            return null;
        }
        String temp = source.toUpperCase(java.util.Locale.ROOT);
        temp = selectiveReplaceFirstChar(temp);
        temp = selectiveReplaceWithoutFirstChar(temp);
        temp = removeRepeatedChars(temp);
        return alphabetiseWordsIgnoringFirstLetter(temp);
    }

    /**
     * Ignoring the first letter, alphabetise each word. Words are separated by spaces and are
     * re-joined with a single space.
     */
    String alphabetiseWordsIgnoringFirstLetter(String source) {
        java.util.StringTokenizer words = new java.util.StringTokenizer(source, " ");
        StringBuilder sb = new StringBuilder(source.length());
        while (words.hasMoreTokens()) {
            char[] chars = words.nextToken().toCharArray();
            // Tokens are never empty, so index 0 (the kept leading letter) always exists.
            java.util.Arrays.sort(chars, 1, chars.length);
            sb.append(chars);
            if (words.hasMoreTokens()) {
                sb.append(' ');
            }
        }
        return sb.toString();
    }

    /**
     * Removes repeated characters: every run of the same character is reduced to one.
     * A leading space is dropped, because the scan starts out as if a space had just been seen.
     */
    String removeRepeatedChars(String source) {
        StringBuilder sb = new StringBuilder(source.length());
        char previous = ' ';
        for (int i = 0; i < source.length(); i++) {
            char current = source.charAt(i);
            if (current != previous) {
                sb.append(current);
            }
            previous = current;
        }
        return sb.toString();
    }

    /**
     * Ignoring the first character, selectively replace sound alikes: first the letter pairs
     * (AE, IA, OE, OI, MC, SC), then letter by letter E, U, Y become I, O becomes A, K becomes C,
     * Z becomes S and H is dropped.
     */
    String selectiveReplaceWithoutFirstChar(String source) {
        if (source.length() <= 1) {
            return source;
        }
        String rest = source.substring(1);
        rest = rest.replace("AE", "I");
        rest = rest.replace("IA", "A");
        rest = rest.replace("OE", "I");
        rest = rest.replace("OI", "A");
        rest = rest.replace("MC", "MAC");
        rest = rest.replace("SC", "S");

        // Per-character translation of "EOUYKZH" to "IAIICS"; H has no counterpart and is removed.
        StringBuilder sb = new StringBuilder(rest.length() + 1);
        sb.append(source.charAt(0));
        for (int i = 0; i < rest.length(); i++) {
            char c = rest.charAt(i);
            switch (c) {
                case 'E':
                case 'U':
                case 'Y':
                    sb.append('I');
                    break;
                case 'O':
                    sb.append('A');
                    break;
                case 'K':
                    sb.append('C');
                    break;
                case 'Z':
                    sb.append('S');
                    break;
                case 'H':
                    break;
                default:
                    sb.append(c);
                    break;
            }
        }
        return sb.toString();
    }

    /**
     * Selectively replaces the first character (or leading pair of characters) of the source.
     */
    String selectiveReplaceFirstChar(String source) {
        return replaceLeading(source, SOUND_EX_LEADING_RULES);
    }
}
100% rename from src/main/resources/au/org/ala/propertystore/known_homonyms.txt rename to ala-name-matching-search/src/main/resources/au/org/ala/propertystore/known_homonyms.txt diff --git a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java similarity index 73% rename from src/test/java/au/org/ala/names/search/ALANameSearcherTest.java rename to ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java index 44f77b492..2782b2ddd 100644 --- a/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/ALANameSearcherTest.java @@ -1,5 +1,21 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.search; import au.org.ala.names.model.*; @@ -20,7 +36,7 @@ public class ALANameSearcherTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); } @Test @@ -31,7 +47,7 @@ public void testMisappliedNames1() throws Exception { fail("A misapplied exception should be thrown"); //assertEquals("urn:lsid:biodiversity.org.au:apni.taxon:549612",lsid); } catch (MisappliedException ex) { - assertEquals("https://id.biodiversity.org.au/node/apni/2915977", ex.getMatchedResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51401037", ex.getMatchedResult().getLsid()); //assertNull(ex.getMisappliedResult()); } } @@ -53,12 +69,27 @@ public void testMisappliedNames2() { @Test public void testMisappliedNames3() { try { - String name = "Scleroderma aurantium (L. : Pers.) Pers."; + String name = "Acacia bivenosa DC."; NameSearchResult nsr = searcher.searchForRecord(name); fail("Expecting misapplied exception"); assertNotNull(nsr); } catch (MisappliedException ex) { - assertEquals("92a4e5c4-32c1-44c6-a9f7-410659692dfa", ex.getMatchedResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2912987", ex.getMatchedResult().getLsid()); + } catch (SearchResultException ex) { + fail("Unexpected search exception " + ex); + } + } + + + @Test + public void testMisappliedNames4() { + try { + String name = "Caladenia concinna"; + NameSearchResult nsr = searcher.searchForRecord(name); + fail("Expecting misapplied exception"); + assertNotNull(nsr); + } catch (MisappliedException ex) { + assertEquals("https://id.biodiversity.org.au/taxon/apni/51398909", ex.getMatchedResult().getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -119,7 +150,7 @@ public void parserBlackList() throws Exception { String name = "Petaurus 
australis unnamed subsp."; String lsid = searcher.searchForLSID(name, true); assertNotNull(lsid); - assertEquals("ALA_Petaurus_australis_unnamed_subsp", lsid); + assertEquals("ALA_3617757", lsid); } @Test @@ -133,7 +164,7 @@ public void testRecursiveSearch() { try { NameSearchResult nsr = searcher.searchForRecord(cl, true, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:3309bb2e-5b3f-4664-977b-147e60b66109", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3309bb2e-5b3f-4664-977b-147e60b66109", nsr.getLsid()); System.out.println(nsr); } catch (Exception e) { e.printStackTrace(); @@ -150,8 +181,8 @@ public void testSpeciesSplitSynonym() { } catch (Exception e) { assertTrue(e instanceof ParentSynonymChildException); ParentSynonymChildException psce = (ParentSynonymChildException) e; - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:c195483c-6ef0-4043-8bdf-6d9464bef8f9", psce.getParentResult().getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:db338300-a464-4ccb-bdc6-2cf92665fb7d", psce.getChildResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/2c5fd509-d4d6-4adb-9566-96280ff9e6af", psce.getParentResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/b4f39a2b-cfaf-4c69-8ace-77f1664acd6b", psce.getChildResult().getLsid()); } } @@ -176,7 +207,7 @@ public void testExcludedNames() { } catch (Exception e) { assertTrue(e instanceof ExcludedNameException); ExcludedNameException ene = (ExcludedNameException) e; - assertEquals("urn:lsid:biodiversity.org.au:afd.name:433c43fe-cf38-4b76-9bdb-55a89fbac291", ene.getExcludedName().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/74ac7082-6138-4eb0-86ba-95535deab180", ene.getExcludedName().getLsid()); } String apcExcludedName = "Parestia elegans"; @@ -205,7 +236,7 @@ public void testHomonymsWithResolution1() throws Exception { cl.setScientificName("Thalia"); try { nsr = 
searcher.searchForRecord("Thalia", null, true); - fail("Thalia should throw a homonym without kingdom or author"); + fail("Thalia should throw a homonym without kingdom or author, got " + nsr.getLsid()); } catch (HomonymException e) { } } @@ -220,7 +251,7 @@ public void testHomonymsWithResolution2() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Homonym should be resolved via the Kingdom"); } @@ -250,7 +281,7 @@ public void testHomonymsWithResolution4() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Author should identify homonym value to use"); } @@ -265,7 +296,7 @@ public void testHomonymsWithResolution5() throws Exception { try { nsr = searcher.searchForRecord(cl, false); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/52c68649-47d5-4f2e-9730-417fc54fb080", nsr.getLsid()); } catch (HomonymException e) { fail("Author should identify homonym value to use"); } @@ -294,7 +325,7 @@ public void testsStrMarker1(){ cl.setScientificName("Macropus rufus"); nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:fbe09d8b-8cc2-444a-b8f7-d06730543781", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/fbe09d8b-8cc2-444a-b8f7-d06730543781", nsr.getLsid()); } catch 
(SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -311,7 +342,7 @@ public void testsStrMarker2(){ cl.setScientificName("Osphranter rufus"); nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -324,7 +355,7 @@ public void testsStrMarker3() { String name = "Oenochrominae s. str."; // There's only one of these left NameSearchResult nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:537ff8fb-b6c2-4536-9cb8-ad244832c1de", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/537ff8fb-b6c2-4536-9cb8-ad244832c1de", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -349,7 +380,7 @@ public void testsStrMarker5() { String name = "Stennella longirostris longirostris"; NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("ALA_Stennella_longirostris_longirostris", nsr.getLsid()); + assertEquals("ALA_190693", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -361,7 +392,7 @@ public void testsStrMarker6() { String name = "Aplonis fusca hulliana"; NameSearchResult nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:d1674a33-af14-4592-be4d-2ededc1b53cd", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/7b241ea8-07ab-4aa0-a2d7-c0b43767c3d4", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -398,8 +429,8 @@ public void testsStrMarker9() { String name = "Siganus nebulosus"; NameSearchResult nsr = 
searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:c2d406d8-1066-4fd3-8c95-31ee6343a1b8", nsr.getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:0aa9653f-00c7-42b9-896b-f399103703b8", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c2d406d8-1066-4fd3-8c95-31ee6343a1b8", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/0aa9653f-00c7-42b9-896b-f399103703b8", nsr.getAcceptedLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); @@ -411,7 +442,7 @@ public void testsStrMarker10() { String name = "Anabathron contabulatum"; NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:eea54328-a4a5-406b-bdfd-3ed119241591", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/39a6129d-dca7-4e3f-bec7-88f0e848c92c", nsr.getLsid()); } catch (SearchResultException ex) { fail("Not expecting exception " + ex); } @@ -443,7 +474,7 @@ public void testQuestionSpeciesMatch() { //test the "name based" synonym "has generic combination" nsr = searcher.searchForRecord("Cacatua leadbeateri", null); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:0217f06f-664c-4c64-bc59-1b54650fa23d", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5815e99d-01cd-4a92-99ba-36f480c4834d", nsr.getAcceptedLsid()); name = "Zieria smithii"; nsr = searcher.searchForRecord(name, null); @@ -465,7 +496,7 @@ public void testSpMarker1() { nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); // Either one can match - assertTrue("https://id.biodiversity.org.au/name/apni/190511".equals(nsr.getLsid()) || "https://id.biodiversity.org.au/name/apni/233691".equals(nsr.getLsid())); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51414212", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search 
exception " + e); } @@ -489,7 +520,7 @@ public void testSpMarker3() { try { String name = "Lindernia sp. Pilbara (M.N.Lyons & L.Lewis FV 1069)"; NameSearchResult nsr = null; - nsr = searcher.searchForRecord(name, RankType.SUBSPECIES); + nsr = searcher.searchForRecord(name, RankType.SPECIES); assertNotNull(nsr); assertEquals("https://id.biodiversity.org.au/name/apni/51306553", nsr.getLsid()); } catch (SearchResultException e) { @@ -583,7 +614,7 @@ public void testPhraseMatch4() { NameSearchResult nsr = null; nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/name/apni/233691", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51414212", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -596,29 +627,39 @@ public void testPhraseMatch5() { NameSearchResult nsr = null; nsr = searcher.searchForRecord(name, null); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/node/apni/2898916", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/instance/apni/9302042", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } } - @Test - public void testSynonymWithoutRank() { - try { - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setKingdom("Animalia"); - cl.setScientificName("Gymnorhina tibicen"); - NameSearchResult nsr = searcher.searchForRecord(cl, true, true); - assertEquals("Gymnorhina tibicen", nsr.getRankClassification().getScientificName()); - assertEquals("(Latham, 1801)", nsr.getRankClassification().getAuthorship()); - nsr = searcher.searchForRecord("Cracticus tibicen", RankType.SPECIES); - assertEquals("Cracticus tibicen", nsr.getRankClassification().getScientificName()); - nsr = searcher.searchForRecord("Cracticus tibicen", RankType.GENUS); - assertEquals(null, nsr); - } catch (Exception e) { + public void testSynonymWithoutRank1() throws Exception { 
+ LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom("Animalia"); + cl.setScientificName("Gymnorhina tibicen"); + NameSearchResult nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Gymnorhina tibicen", nsr.getRankClassification().getScientificName()); + assertEquals("(Latham, 1801)", nsr.getRankClassification().getAuthorship()); + cl.setScientificName("Cracticus tibicen"); + cl.setRank(RankType.SPECIES.getRank()); + nsr = searcher.searchForRecord(cl, true, true); + assertEquals("ALA_3267030", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5291343e-fdeb-4a65-8ba5-928f5b96acf5", nsr.getAcceptedLsid()); + } - } + + @Test + public void testSynonymWithoutRank2() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Abantiades zonatriticum"); + NameSearchResult nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Abantiades", nsr.getRankClassification().getScientificName()); + assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); + cl.setRank(RankType.SPECIES.getRank()); + nsr = searcher.searchForRecord(cl, true, true); + assertEquals("Abantiades", nsr.getRankClassification().getScientificName()); + assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); } @Test @@ -637,11 +678,11 @@ public void testRecordSearchWithoutScientificName() { @Test public void testInfragenricAndSoundEx1() { - String nameDifferentEnding = "Phylidonyris pyrrhopterus"; + String nameDifferentEnding = "Phylidonyris pyrrhoptera"; try { NameSearchResult nsr = searcher.searchForRecord(nameDifferentEnding, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search 
exception " + ex.getMessage()); @@ -654,8 +695,8 @@ public void testInfragenricAndSoundEx2() { try { NameSearchResult nsr = searcher.searchForRecord(nameWithInfraGenric, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); - assertEquals(MatchType.EXACT, nsr.getMatchType()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); + assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -663,11 +704,11 @@ public void testInfragenricAndSoundEx2() { @Test public void testInfragenricAndSoundEx3() { - String nameDiffEndInfraGeneric = "Phylidonyris (Phylidonyris) pyrrhopterus"; + String nameDiffEndInfraGeneric = "Phylidonyris (Phylidonyris) pyrrhopteras"; try { NameSearchResult nsr = searcher.searchForRecord(nameDiffEndInfraGeneric, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:604e6ea0-1a7f-4ee4-ad50-2cff8243631f", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/61f2bc62-dd50-4ba2-82a0-0377d386e4d8", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -680,7 +721,7 @@ public void testInfragenricAndSoundEx4() { try { NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1ba5449-a68e-4c3b-ae90-8e667617945b", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c7d8dbc8-dcde-4182-85ba-907182f95ea9", nsr.getLsid()); assertEquals(MatchType.EXACT, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -693,7 +734,7 @@ public void testInfragenricAndSoundEx5() { try { NameSearchResult nsr = 
searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1ba5449-a68e-4c3b-ae90-8e667617945b", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c7d8dbc8-dcde-4182-85ba-907182f95ea9", nsr.getLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -706,8 +747,8 @@ public void testInfragenricAndSoundEx6() { try { NameSearchResult nsr = searcher.searchForRecord(name, null, true); assertNotNull(nsr); - assertEquals("CAAB:79629da1:6054320e:589caaa6:bb265593", nsr.getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:1a85a82f-5a1f-4c56-9f04-918643461260", nsr.getAcceptedLsid()); + assertEquals("SY_39006017_1", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/1a85a82f-5a1f-4c56-9f04-918643461260", nsr.getAcceptedLsid()); assertEquals(MatchType.SOUNDEX, nsr.getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); @@ -780,7 +821,7 @@ public void testOutOfGeography1() { try { NameSearchResult nsr = searcher.searchForRecord(classification, true, true, true); assertNotNull(nsr); - assertEquals("ALA_Proboscidea", nsr.getLsid()); + assertEquals("ALA_3267031", nsr.getLsid()); assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); assertEquals(RankType.ORDER, nsr.getRank()); } catch (SearchResultException ex) { @@ -797,7 +838,7 @@ public void testOutOfGeography2() { try { NameSearchResult nsr = searcher.searchForRecord(classification, true, true, true); assertNotNull(nsr); - assertEquals("ALA_Myrina", nsr.getLsid()); + assertEquals("ALA_3267033", nsr.getLsid()); assertEquals(MatchType.RECURSIVE, nsr.getMatchType()); assertEquals(RankType.GENUS, nsr.getRank()); } catch (SearchResultException ex) { @@ -856,7 +897,7 @@ public void testPhraseNames() { public void testNoRank() { try { String lsid = 
searcher.searchForLSID("Animalia"); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:4647863b-760d-4b59-aaa1-502c8cdf8d3c", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/4647863b-760d-4b59-aaa1-502c8cdf8d3c", lsid); lsid = searcher.searchForLSID("Bacteria"); assertEquals("NZOR-6-73174", lsid); } catch (SearchResultException e) { @@ -911,7 +952,7 @@ public void testIgnoredHomonyms2() { cl.setGenus("Macropus"); //NameSearchResult nsr =searcher.searchForRecord(cl.getId(), cl, null, true,true); String lsid = searcher.searchForLSID("Macropus", false, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b1d9bf29-648f-47e6-8544-2c2fbdf632b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/b1d9bf29-648f-47e6-8544-2c2fbdf632b1", lsid); } catch (Exception e) { fail("ignored homonyms should not throw exception " + e.getMessage()); } @@ -940,7 +981,7 @@ public void testIgnoredHomonyms4() { cl.setGenus("Agathis"); cl.setKingdom("Animalia"); NameSearchResult nsr = searcher.searchForRecord(cl.getScientificName(), cl, null, true, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:a4109d9e-723c-491a-9363-95df428fe230", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/d02923bc-cf54-4d7f-ae74-aac1d6af1830", nsr.getLsid()); } catch (Exception e) { fail("A kingdom was supplied and should be resolvable. 
" + e.getMessage()); } @@ -1022,7 +1063,7 @@ public void testCommonNames1() { String name = "Red Kangaroo"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/e6aff6af-ff36-4ad5-95f2-2dfdcca8caff", lsid); assertEquals("Osphranter rufus", sciName); } @@ -1031,8 +1072,8 @@ public void testCommonNames2() { String name = "Yellow-tailed Black-Cockatoo"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:72ca8d75-71da-4751-a5cf-aa07ac3869f7", lsid); - assertEquals("Calyptorhynchus (Zanda) funereus", sciName); + assertEquals("https://biodiversity.org.au/afd/taxa/145b081d-eca7-4d9b-9171-b97e2d061536", lsid); + assertEquals("Zanda funerea", sciName); } @Test @@ -1040,7 +1081,7 @@ public void testCommonNames3() { String name = "Scarlet Robin"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b02a8195-266e-463b-89b7-3dc2a1c48450", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/a3e5376b-f9e6-4bdf-adae-1e7add9f5c29", lsid); assertEquals("Petroica (Petroica) boodang", sciName); } @@ -1049,7 +1090,7 @@ public void testCommonNames4() { String name = "Pacific Bluefin Tuna"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b35bf6d6-3b67-4d4c-b81e-b7ca7a64d341", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/b35bf6d6-3b67-4d4c-b81e-b7ca7a64d341", lsid); assertEquals("Thunnus orientalis", sciName); } @@ -1058,7 +1099,7 @@ public void testCommonNames5() { String name = "Pacific Black Duck"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - 
assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:da8a156f-95e2-4fcb-a6e7-52721705a70c", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/81be58f5-caf7-4f3d-b1eb-d4f83eb0af5a", lsid); assertEquals("Anas (Anas) superciliosa", sciName); } @@ -1067,7 +1108,7 @@ public void testCommonNames6() { String name = "European Carp"; String lsid = getCommonNameLSID(name); String sciName = getCommonName(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:16171fac-8d6c-4327-9fab-f2db864d71bf", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/16171fac-8d6c-4327-9fab-f2db864d71bf", lsid); assertEquals("Cyprinus carpio", sciName); } @@ -1075,13 +1116,13 @@ public void testCommonNames6() { public void testCommonNames7() { String name = "Sulphur-crested Cockatoo"; String lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); name = "Sulphur crested Cockatoo"; lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); name = "SULPHUR CRESTED COCKATOO"; lsid = getCommonNameLSID(name); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9eb417b-2de3-48ac-ba4e-1d438f0cb323", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/2c33a1fd-34f4-48ec-9ae6-38b51f2aa7ea", lsid); String sciName = getCommonName(name); assertEquals("Cacatua (Cacatua) galerita", sciName); } @@ -1159,7 +1200,7 @@ public void testMyrmecia() { public void testSearchForLSID1() { try { String lsid = searcher.searchForLSID("Anochetus"); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); 
} catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1169,7 +1210,7 @@ public void testSearchForLSID1() { public void testSearchForLSID2() { try { String lsid = searcher.searchForLSID("Anochetus", true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1179,7 +1220,7 @@ public void testSearchForLSID2() { public void testSearchForLSID3() { try { String lsid = searcher.searchForLSID("Anochetus", true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1189,7 +1230,7 @@ public void testSearchForLSID3() { public void testSearchForLSID4() { try { String lsid = searcher.searchForLSID("Anochetus", RankType.GENUS); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1200,7 +1241,7 @@ public void testSearchForLSID5() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID("Anochetus", cl, RankType.GENUS); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search 
exception " + ex.getMessage()); } @@ -1211,7 +1252,7 @@ public void testSearchForLSID6() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1222,7 +1263,7 @@ public void testSearchForLSID7() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptera", "Formicidae", "Anochetus", null); String lsid = searcher.searchForLSID(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); + assertEquals("https://biodiversity.org.au/afd/taxa/f9d0d9dc-597d-4344-9e06-1704af36b9b1", lsid); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex.getMessage()); } @@ -1230,11 +1271,11 @@ public void testSearchForLSID7() { @Test public void testFuzzyMatches() throws Exception { - //Eolophus roseicapillus - non fuzzy match - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:577ff059-a2a7-48b0-976c-fdd6a345f878", searcher.searchForLSID("Eolophus roseicapilla")); + //Eolophus roseicapilla - non fuzzy match + assertEquals("https://biodiversity.org.au/afd/taxa/9b4ad548-8bb3-486a-ab0a-905506c463ea", searcher.searchForLSID("Eolophus roseicapilla")); - //Eolophus roseicapilla - fuzzy match - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:577ff059-a2a7-48b0-976c-fdd6a345f878", searcher.searchForLSID("Eolophus roseicapillus", true)); + //Eolophus roseicapillus - fuzzy match + assertEquals("https://biodiversity.org.au/afd/taxa/9b4ad548-8bb3-486a-ab0a-905506c463ea", searcher.searchForLSID("Eolophus 
roseicapillus", true)); } @Test @@ -1308,10 +1349,10 @@ public void testRankMarker() { @Test public void testSimpleLookup1() { try { - String name = "Megalurus gramineus"; + String name = "Poodytes gramineus"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:b88430ed-f7d7-482e-a586-f0a02d8e11ce", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/061fef09-7c9d-4b6d-9827-4da13a350dc6", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1324,7 +1365,7 @@ public void testSimpleLookup2() { String name = "Synemon plana"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:a51dca29-50e7-49b4-ae35-5c35a9c4f854", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/a51dca29-50e7-49b4-ae35-5c35a9c4f854", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1349,7 +1390,7 @@ public void testSimpleLookup4() { String name = "Chenopodium x bontei nothovar. 
submelanocarpum"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/instance/apni/769095", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2902250", nsr.getLsid()); } catch (SearchResultException e) { fail("Unexpected search exception " + e); } @@ -1363,8 +1404,8 @@ public void testSimpleLookup5() { cl.setScientificName("Favolus princeps"); NameSearchResult nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); // Been removed - assertEquals("43e1bc65-3580-47db-b269-cdb066ed49e9", nsr.getLsid()); - assertEquals( "10911fd1-a2dd-41f1-9c4d-8dff7f118670", nsr.getAcceptedLsid()); + assertEquals("https://id.biodiversity.org.au/instance/fungi/60071845", nsr.getLsid()); + assertEquals( "https://id.biodiversity.org.au/node/fungi/60098663", nsr.getAcceptedLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1393,7 +1434,7 @@ public void testSimpleLookup7() { String name = "Astomum"; NameSearchResult nsr = searcher.searchForRecord(name, cl, RankType.GENUS); assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/name/ausmoss/10001613", nsr.getLsid()); + assertEquals("NZOR-6-29460", nsr.getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1404,7 +1445,7 @@ public void testSimpleLookup8() { try { String name = "Carbo ater"; NameSearchResult nsr = searcher.searchForRecord(name); - fail("Expecting ecxluded name exception"); + fail("Expecting excluded name exception"); } catch (ExcludedNameException ex) { assertNull(ex.getNonExcludedName()); // Two types both excluded } catch (SearchResultException ex) { @@ -1418,7 +1459,7 @@ public void testSimpleLookup9() { String name = "Neobatrachus sudellae"; NameSearchResult nsr = searcher.searchForRecord(name); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:953a5af4-2932-4c8b-8f33-850b5f8f3fed", nsr.getLsid()); + 
assertEquals("https://biodiversity.org.au/afd/taxa/953a5af4-2932-4c8b-8f33-850b5f8f3fed", nsr.getLsid()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); } @@ -1449,6 +1490,93 @@ public void testSimpleLookup12() { fail("Unexpected search exception " + e); } } + + // Do not match nom illeg. taxonomic status + @Test + public void testSimpleLookup13() throws Exception { + String name = "Banksia collina"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/instance/apni/838699", nsr.getLsid()); + } + + @Test + public void testSimpleLookup14() throws Exception { + String name = "Stephanopis similis"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://biodiversity.org.au/afd/taxa/24bc164a-85b2-4633-85c5-a3b399daec0a", nsr.getLsid()); + } + + @Test + public void testSimpleLookup15() throws Exception { + String name = "Fraus latistria"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://biodiversity.org.au/afd/taxa/2358fcc0-8db2-475d-8da4-fd4bd5e711f2", nsr.getLsid()); + } + + @Test + public void testSimpleLookup16() throws Exception { + String name = "Metrosideros fulgens"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/name/apni/110385", nsr.getLsid()); + } + + + @Test + public void testSimpleLookup17() throws Exception { + String name = "Metrosideros scandens"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/name/apni/233086", nsr.getLsid()); + } + + + @Test + public void testAffLookup1() throws Exception { + String name = "Carex aff. tereticaulis (Lake Omeo)"; + NameSearchResult nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("ALA_186619", nsr.getLsid()); + name = "Carex aff. 
tereticaulis"; + nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("ALA_186619", nsr.getLsid()); + name = "Carex tereticaulis"; + nsr = searcher.searchForRecord(name); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/node/apni/2919780", nsr.getLsid()); + } + + + @Test + public void testMetricsLookup1() throws Exception { + String name = "Geopelia placida"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true, true); + assertNotNull(metrics); + assertEquals("https://biodiversity.org.au/afd/taxa/3d5c4e0d-5138-46e0-8e14-5acd8fd2c523", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); + } + + @Ignore // Until sub-taxon synonymy decided + @Test + public void testMetricsLookup2() throws Exception { + String name = "Trigonaphera vinnulum"; // Synonym of Trigonostoma vinnulum + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true, true); + assertNotNull(metrics); + assertEquals("https://biodiversity.org.au/afd/taxa/7e67e588-927e-48a9-8765-365ae9f25fcb", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/5855a347-eee2-47bb-8130-94d49602d232", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); + } + @Test public void testParentChildSynonym1() { try { @@ -1458,7 +1586,7 @@ public void testParentChildSynonym1() { } catch (ParentSynonymChildException ex) { NameSearchResult nsr = ex.getChildResult(); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:d0e66526-1cdd-4b03-85b2-71b7e7d8b84a", 
nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3e062650-6ecb-43e7-a903-5487e3dbbbb5", nsr.getLsid()); assertEquals(RankType.SUBSPECIES, nsr.getRank()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); @@ -1474,7 +1602,7 @@ public void testParentChildSynonym2() { } catch (ParentSynonymChildException ex) { NameSearchResult nsr = ex.getChildResult(); assertNotNull(nsr); - assertEquals("8e64942a-f300-46c8-ba97-76492d25d985", nsr.getLsid()); + assertEquals("https://id.biodiversity.org.au/node/fungi/60083449", nsr.getLsid()); assertEquals(RankType.FORM, nsr.getRank()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); @@ -1490,8 +1618,8 @@ public void testStigmoderaAurifera() { cl.setScientificName(name); NameSearchResult nsr = searcher.searchForRecord(cl, true); assertNotNull(nsr); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:e89de580-2942-479d-b5ef-5edd60424560", nsr.getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2e8ac1d8-5f2b-4fcd-a124-c619c7cab6b0", nsr.getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/426ab801-0d5f-4b43-b1b4-55ce7ce7a44e", nsr.getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/6c212123-fadc-4307-8dd8-ac501bb534ba", nsr.getAcceptedLsid()); assertEquals("Stigmodera aurifera", nsr.getRankClassification().getScientificName()); assertEquals(MatchType.CANONICAL, nsr.getMatchType()); } catch (SearchResultException e) { @@ -1577,7 +1705,7 @@ public void testHigherTaxonMatch2() { } @Test - public void testHomonymWithOrderResolution1() { + public void testHomonymWithOrderResolution1() throws Exception { try { String name = "Abelia"; LinnaeanRankClassification cl = new LinnaeanRankClassification(); @@ -1585,22 +1713,16 @@ public void testHomonymWithOrderResolution1() { NameSearchResult nsr = searcher.searchForRecord(cl, true); fail("Expecting homonym exception"); } catch (HomonymException ex) { - assertEquals(1, 
ex.getResults().size()); - } catch (SearchResultException e) { - fail("Unexpected search exception " + e); - } - try { - String name = "Abelia"; - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setScientificName(name); - cl.setOrder("Dipsacales"); - NameSearchResult nsr = searcher.searchForRecord(cl, true); - assertNotNull(nsr); - assertEquals("https://id.biodiversity.org.au/node/apni/2892114", nsr.getLsid()); - } catch (SearchResultException e) { - fail("Unexpected search exception " + e); + assertEquals(2, ex.getResults().size()); } - } + String name = "Abelia"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + cl.setOrder("Dipsacales"); + NameSearchResult nsr = searcher.searchForRecord(cl, true); + assertNotNull(nsr); + assertEquals("https://id.biodiversity.org.au/node/apni/2892114", nsr.getLsid()); + } @Test public void testMultipleMisappliedResolution1() throws Exception { @@ -1639,6 +1761,26 @@ public void testMultipleMisappliedResolution3() throws Exception { assertTrue(metrics.getErrors().contains(ErrorType.MISAPPLIED)); } + // Ensure misapplication is ignored + @Test + public void testMultipleMisappliedResolution4() throws Exception { + String name = "Pterostylis bryophila"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51412050", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + name = "Pterostylis obtusa"; + cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51412242", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + 
assertTrue(metrics.getErrors().contains(ErrorType.MATCH_MISAPPLIED)); + } + // Synonym and accepted @Test @@ -1678,6 +1820,20 @@ public void testSynonymAccepted3() throws Exception { assertEquals("https://id.biodiversity.org.au/node/apni/2911212", metrics.getResult().getAcceptedLsid()); } + + @Test + public void testSynonymAccepted4() throws Exception { + String name = "Sugomel niger"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("ALA_3782348", metrics.getResult().getLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertTrue(metrics.getErrors().contains(ErrorType.NONE)); + assertEquals("https://biodiversity.org.au/afd/taxa/b32a2ec6-315c-48cf-84b3-4898e39f4b57", metrics.getResult().getAcceptedLsid()); + } + // Available as a synonym but also misapplied. @Test public void testSynonymMisapplied1() throws Exception { @@ -1693,4 +1849,125 @@ public void testSynonymMisapplied1() throws Exception { assertTrue(metrics.getErrors().contains(ErrorType.MATCH_MISAPPLIED)); } + + // Higher taxonomy only filled out + @Test + public void testHigherTaxonomy() throws Exception { + String family = "Pterophoridae"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setFamily(family); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://biodiversity.org.au/afd/taxa/81da9a0d-ecb6-4040-a56d-12a44042b63b", metrics.getResult().getLsid()); + assertEquals(RankType.FAMILY, metrics.getResult().getRank()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + } + + // Phrase name with rank marker + @Test + public void testPhraseName1() throws Exception { + String name = "Tephrosia sp. 
Crowded pinnae (C.R.Dunlop 8202)"; + String kingdom = "Plantae"; + String phylum = "Streptophyta"; + String class_ = "Equisetopsida"; + String order = "Fabales"; + String genus = "Tephrosia"; + String specificEpithet = "sp. Crowded pinnae (C.R.Dunlop 8202)"; + String rank = "species"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setPhylum(phylum); + cl.setKlass(class_); + cl.setOrder(order); + cl.setGenus(genus); + cl.setSpecificEpithet(specificEpithet); + cl.setRank(rank); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/instance/apni/932722", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2890778", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + } + + @Test + public void testPhraseName2() throws Exception { + String name = "Tephrosia sp. (Miriam Vale E.J.Thompson+ MIR33)"; + String kingdom = "Plantae"; + String class_ = "Equisetopsida"; + String genus = "Tephrosia"; + String rank = "species"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setKlass(class_); + cl.setGenus(genus); + //cl.setRank(rank); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/instance/apni/51376249", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2903953", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertEquals(SynonymType.SUBJECTIVE_SYNONYM, metrics.getResult().getSynonymType()); + } + + @Test + public void testPhraseName3() throws Exception { + String name = "Thryptomene sp. Leinster (B.J. Lepschi & L.A. 
Craven 4362) PN"; + String kingdom = "Plantae"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/node/apni/2904210", metrics.getResult().getLsid()); + assertEquals(MatchType.PHRASE, metrics.getResult().getMatchType()); + } + + @Test + public void testPhraseName4() throws Exception { + String name = "Tephrosia sp. Miriam Vale (E.J.Thompson+ MIR33) WA Herbarium"; + String kingdom = "Plantae"; + String class_ = "Equisetopsida"; + String genus = "Tephrosia"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setKingdom(kingdom); + cl.setKlass(class_); + cl.setGenus(genus); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/node/apni/2903953", metrics.getResult().getLsid()); + assertEquals(MatchType.PHRASE, metrics.getResult().getMatchType()); + } + + // Ensure illegitimate names are excluded from the system and don't gum the works up + @Test + public void testIllegitimate1() throws Exception { + String name = "Banksia collina"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/instance/apni/838699", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/node/apni/2900678", metrics.getResult().getAcceptedLsid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + assertEquals(SynonymType.OBJECTIVE_SYNONYM, metrics.getResult().getSynonymType()); + } + + + // Ensure illegitimate names are excluded from the system and don't gum the works up + @Test + public void testIllegitimate2() 
throws Exception { + String name = "Zieria fordii"; + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName(name); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertNotNull(metrics); + assertEquals("https://id.biodiversity.org.au/name/apni/51337126", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51367864", metrics.getResult().getRankClassification().getGid()); + assertEquals("https://id.biodiversity.org.au/taxon/apni/51367862", metrics.getResult().getRankClassification().getFid()); + assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); + } + } diff --git a/ala-name-matching-search/src/test/java/au/org/ala/names/search/AutocompleteTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/AutocompleteTest.java new file mode 100644 index 000000000..0f0925add --- /dev/null +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/AutocompleteTest.java @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + +package au.org.ala.names.search; + +import au.org.ala.names.model.*; +import org.gbif.api.model.checklistbank.ParsedName; +import org.gbif.nameparser.PhraseNameParser; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.List; +import java.util.Map; + +import static org.junit.Assert.*; + +public class AutocompleteTest { + private static ALANameSearcher searcher; + + @org.junit.BeforeClass + public static void init() throws Exception { + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); + } + + @Test + public void testAutocomplete1() throws Exception { + List results = searcher.autocomplete("Elusor", 10, false); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Elusor", first.get("name")); + } + + @Test + public void testAutocomplete2() throws Exception { + List results = searcher.autocomplete("Mary riv", 10, false); + assertNotNull(results); + assertTrue(results.size() > 1); + Map first = results.get(0); + assertEquals("Samadera sp. 
Mary River", first.get("name")); + Map second = results.get(1); + assertEquals("Mary River cod", second.get("commonname")); + assertEquals("Maccullochella mariensis", second.get("name")); + Map third = results.get(2); + assertEquals("Mary River turtle", third.get("commonname")); + assertEquals("Elusor macrurus", third.get("name")); + } + + @Test + public void testAutocomplete3() throws Exception { + List results = searcher.autocomplete("Mary river t", 10, false); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Mary River turtle", first.get("commonname")); + assertEquals("Elusor macrurus", first.get("name")); + } + + @Test + public void testAutocomplete4() throws Exception { + List results = searcher.autocomplete("Acacia", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Acacia", first.get("name")); + } + + @Test + public void testAutocomplete5() throws Exception { + List results = searcher.autocomplete("Acacia d", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Acacia dampieri", first.get("name")); + } + + @Test + public void testAutocomplete6() throws Exception { + List results = searcher.autocomplete("Mylitta pse", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Hysterangium pseudacaciae", first.get("name")); + assertNotNull(first.get("synonymMatch")); + } + + + @Test + public void testAutocomplete7() throws Exception { + // No match with synonym + List results = searcher.autocomplete("Mylitta pse", 10, false); + assertNotNull(results); + assertTrue(results.isEmpty()); + } + + + @Test + public void testAutocomplete8() throws Exception { + // No match with garbage + List results = searcher.autocomplete("Glurglefkluff11", 10, true); + assertNotNull(results); + assertTrue(results.isEmpty()); + } + + @Test + 
public void testAutocomplete9() throws Exception { + List results = searcher.autocomplete("Osphra", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Osphranter", first.get("name")); + } + + @Test + public void testAutocomplete10() throws Exception { + List results = searcher.autocomplete("Rossi", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Rhachotropis rossi", first.get("name")); + } + + + @Test + public void testAutocomplete11() throws Exception { + List results = searcher.autocomplete("rush", 10, false); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Juncus", first.get("name")); + assertEquals("Rushes", first.get("commonname")); + } + + + @Test + public void testAutocomplete12() throws Exception { + List results = searcher.autocomplete("rush", 10, true); + assertNotNull(results); + assertTrue(results.size() > 0); + Map first = results.get(0); + assertEquals("Acacia alleniana", first.get("name")); + List synonyms = (List) first.get("synonymMatch"); + assertNotNull(synonyms); + assertTrue(synonyms.size() > 0); + Map synonym = synonyms.get(0); + assertEquals("Rush-leaved Wattle", synonym.get("commonname")); + } + +} \ No newline at end of file diff --git a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java similarity index 88% rename from src/test/java/au/org/ala/names/search/BiocacheMatchTest.java rename to ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java index 20f522b1c..a4ba57a8a 100644 --- a/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/BiocacheMatchTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights 
Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.search; import au.org.ala.names.model.*; @@ -22,17 +38,16 @@ public class BiocacheMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); } @Test - @Ignore public void testMatchHybrid(){ try{ LinnaeanRankClassification cl = new LinnaeanRankClassification(); cl.setScientificName("Eucalyptus globulus x Eucalyptus ovata"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("hybrid", metrics.getNameType().toString()); + assertEquals(NameType.HYBRID, metrics.getNameType()); assertEquals(RankType.SPECIES, metrics.getResult().getRank()); } catch(Exception e){ @@ -73,20 +88,15 @@ public void synonymHomonymIssue(){ } @Test - public void testRecursiveAuthorshipIssue() { - try { - LinnaeanRankClassification cl = new LinnaeanRankClassification(); - cl.setScientificName("Graphis notreallyaname Mull.Arg."); - cl.setAuthorship("Mull.Arg."); - cl.setKingdom("Animalia"); - cl.setGenus("Graphis"); - cl.setSpecificEpithet("notreallyaname"); - MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2af76a1e-2086-46e3-90b9-6f00983b21a5", metrics.getResult().getLsid()); // Graphis from AFD - } catch (Exception e) { - e.printStackTrace(); - fail("Exception should not occur"); - } + public 
void testRecursiveAuthorshipIssue1() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Graphis notreallyaname Mull.Arg."); + cl.setAuthorship("Mull.Arg."); + cl.setKingdom("Animalia"); + cl.setGenus("Graphis"); + cl.setSpecificEpithet("notreallyaname"); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertEquals("https://biodiversity.org.au/afd/taxa/2af76a1e-2086-46e3-90b9-6f00983b21a5", metrics.getResult().getLsid()); // Graphis from AFD } @Test @@ -98,7 +108,17 @@ public void testRecursiveAuthorshipIssue2() throws Exception { cl.setGenus("Graphis"); cl.setSpecificEpithet("notreallyaname"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("NZOR-6-122770", metrics.getResult().getLsid()); // Can't find Graphis since not APC placed so gets Graphidaceae + assertEquals("NZOR-6-132826", metrics.getResult().getLsid()); // Can't find Graphis homonym so gets Graphidaceae + } + + @Test + public void testRecursiveAuthorshipIssue3() throws Exception { + LinnaeanRankClassification cl = new LinnaeanRankClassification(); + cl.setScientificName("Graphis"); + cl.setKingdom("Fungi"); + cl.setGenus("Graphis"); + MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); + assertEquals("NZOR-6-122770", metrics.getResult().getLsid()); // Can't find Graphis homonym so gets Graphidaceae } @Test @@ -120,7 +140,7 @@ public void testCrossRankHomonym() throws Exception { assertFalse("Cross rank homonym should have been resolved",metrics.getErrors().contains(ErrorType.HOMONYM)); } - // @Test + @Test public void testTibicentibicen() { try { LinnaeanRankClassification cl = new LinnaeanRankClassification(); @@ -147,7 +167,7 @@ public void testSPNovName() { cl.setSpecificEpithet(spEp); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); //System.out.println(metrics.getResult()); - 
assertEquals("http://id.biodiversity.org.au/instance/apni/884433", metrics.getResult().getLsid()); + assertEquals("https://id.biodiversity.org.au/instance/apni/884433", metrics.getResult().getLsid()); assertTrue(metrics.getErrors().contains(ErrorType.HOMONYM)); } catch (Exception e) { @@ -236,7 +256,7 @@ public void testParentChildWithDifferentSpelling1() throws Exception { cl.setScientificName("Climacteris affinis"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:5d7c50bc-2c2d-4984-9924-d2a46dc3b00f", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/0d28bce2-0bae-44f6-9c73-0afc0f343b8c", metrics.getResult().getLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); // Dereferenced synonym assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); } @@ -247,7 +267,7 @@ public void testParentChildWithDifferentSpelling2() throws Exception { cl.setScientificName("Limnodynastes dumerilii"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:2c50c2f6-7a0d-44e1-b549-458427b420c4", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/2c50c2f6-7a0d-44e1-b549-458427b420c4", metrics.getResult().getLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); // Dereferenced synonym assertTrue(metrics.getErrors().contains(ErrorType.PARENT_CHILD_SYNONYM)); } @@ -259,7 +279,7 @@ public void testAffCfSpecies1() throws Exception { // No issues cl.setScientificName("Zabidius novemaculeatus"); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:58e06bba-de3b-4c8c-b165-d75bbeb21a36", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/58e06bba-de3b-4c8c-b165-d75bbeb21a36", metrics.getResult().getLsid()); 
assertTrue(metrics.getErrors().contains(ErrorType.NONE)); cl = new LinnaeanRankClassification(); @@ -338,6 +358,7 @@ public void testSubSpeciesMarker1() { fail("Unexpected search exception " + ex); } } + // See https://github.com/AtlasOfLivingAustralia/ala-name-matching/issues/1 @Test public void testSubSpeciesMarker2() { @@ -453,8 +474,8 @@ public void testDingo1() { cl.setScientificName(name); MetricsResultDTO metrics = searcher.searchForRecordMetrics(cl, true); assertNotNull(metrics); - assertEquals("urn:lsid:biodiversity.org.au:afd.name:3064f20b-f6de-4375-8377-904cbd6cf9fa", metrics.getResult().getLsid()); - assertEquals("urn:lsid:biodiversity.org.au:afd.taxon:c2056f1b-fcde-45b9-904b-1cab280368d1", metrics.getResult().getAcceptedLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/3064f20b-f6de-4375-8377-904cbd6cf9fa", metrics.getResult().getLsid()); + assertEquals("https://biodiversity.org.au/afd/taxa/c2056f1b-fcde-45b9-904b-1cab280368d1", metrics.getResult().getAcceptedLsid()); assertEquals(MatchType.EXACT, metrics.getResult().getMatchType()); } catch (SearchResultException ex) { fail("Unexpected search exception " + ex); diff --git a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java b/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java similarity index 88% rename from src/test/java/au/org/ala/names/search/IconicSpeciesTest.java rename to ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java index f0a071cef..42a696fe1 100644 --- a/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/IconicSpeciesTest.java @@ -1,14 +1,31 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. 
You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.search; -import com.opencsv.CSVReader; -import au.org.ala.names.model.NameSearchResult; import au.org.ala.names.model.LinnaeanRankClassification; +import au.org.ala.names.model.NameSearchResult; import au.org.ala.names.model.RankType; -import org.apache.commons.lang.StringUtils; -import org.junit.Ignore; +import com.opencsv.CSVParser; +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; +import org.apache.commons.lang3.StringUtils; import org.junit.Test; -import java.io.File; import java.io.FileReader; import java.io.InputStreamReader; import java.util.List; @@ -30,7 +47,7 @@ public class IconicSpeciesTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); } //@Test @@ -90,7 +107,15 @@ public void testIconicSpeciesSRCCOL() { @Test public void testIconicSpeciesFile() { try { - CSVReader reader = new CSVReader(new InputStreamReader(this.getClass().getResourceAsStream("iconic_species_list.csv")), ',', '"'); + CSVParser csvParser = new CSVParserBuilder() + .withSeparator(',') + .withQuoteChar('"') + .withEscapeChar('\\') + .build(); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(this.getClass().getResourceAsStream("iconic_species_list.csv"))) + .withCSVParser(csvParser) + .withSkipLines(1) + .build(); String[] values; int passed = 0, failed = 0; diff --git a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java 
b/ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java similarity index 78% rename from src/test/java/au/org/ala/names/search/VernacularMatchTest.java rename to ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java index 3f9c16177..6d221ee79 100644 --- a/src/test/java/au/org/ala/names/search/VernacularMatchTest.java +++ b/ala-name-matching-search/src/test/java/au/org/ala/names/search/VernacularMatchTest.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.search; import au.org.ala.names.model.LinnaeanRankClassification; @@ -24,13 +40,13 @@ public class VernacularMatchTest { @org.junit.BeforeClass public static void init() throws Exception { - searcher = new ALANameSearcher("/data/lucene/namematching-20200214"); + searcher = new ALANameSearcher("/data/lucene/namematching-20210811"); } @Test public void testVernacular1() throws Exception { String name = "Mary River Turtle"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:d315deea-822c-4f2c-b439-da33d6af5fd6"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/d315deea-822c-4f2c-b439-da33d6af5fd6"; NameSearchResult result = null; result = searcher.searchForCommonName(name); @@ -38,7 +54,7 @@ public void testVernacular1() throws Exception { assertEquals(expectedLsid, result.getLsid()); } - @Ignore // Requires indidgenous names + //@Ignore // Requires indidgenous names @Test public void testVernacular2() throws Exception { String name = "Dhulwa"; @@ -52,7 +68,7 @@ public void testVernacular2() throws Exception { @Test public void testVernacular3() throws Exception { String name = "Drain Mangrovegoby"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:19c60dcd-93a0-40a2-9ac1-3abe7119c505"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/19c60dcd-93a0-40a2-9ac1-3abe7119c505"; NameSearchResult result = null; result = searcher.searchForCommonName(name); @@ -64,7 +80,7 @@ public void testVernacular3() throws Exception { @Test public void testVernacular4() throws Exception { String name = "Onespine Unicornfish"; - String expectedLsid = "urn:lsid:biodiversity.org.au:afd.taxon:f7bfd383-5501-4196-9acb-d9d4d03cc45d"; + String expectedLsid = "https://biodiversity.org.au/afd/taxa/f7bfd383-5501-4196-9acb-d9d4d03cc45d"; NameSearchResult result = null; result = searcher.searchForCommonName(name); diff --git a/src/test/resources/au/org/ala/names/search/iconic_species_list.csv 
b/ala-name-matching-search/src/test/resources/au/org/ala/names/search/iconic_species_list.csv similarity index 96% rename from src/test/resources/au/org/ala/names/search/iconic_species_list.csv rename to ala-name-matching-search/src/test/resources/au/org/ala/names/search/iconic_species_list.csv index d96313247..568e24879 100644 --- a/src/test/resources/au/org/ala/names/search/iconic_species_list.csv +++ b/ala-name-matching-search/src/test/resources/au/org/ala/names/search/iconic_species_list.csv @@ -23,12 +23,12 @@ BIRDS ,Boobook Owl,,Animalia,Chordata,Aves,STRIGIFORMES,STRIGIDAE,Ninox,novaeseelandiae,,,Yes,Yes,Yes ,Little Raven,,Animalia,Chordata,Aves,PASSERIFORMES,CORVIDAE,Corvus,mellori,,,Yes,Yes,Yes ,Sulphur-crested Cockatoo,,Animalia,Chordata,Aves,PSITTACIFORMES,CACATUIDAE,Cacatua,galerita,,,Yes,Yes,Yes -,Osprey,,Animalia,Chordata,Aves,FALCONIFORMES,ACCIPITRIDAE,Pandion,haliaetus,,,Yes,Yes,Yes +,Osprey,,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Pandion,haliaetus,,,Yes,Yes,Yes ,Major Mitchell Cockatoo,,Animalia,Chordata,Aves,PSITTACIFORMES,CACATUIDAE,Lophochroa,leadbeateri,,,Yes,Yes,Yes ,Southern Cassowary,,Animalia,Chordata,Aves,STRUTHIONIFORMES,CASUARIIDAE,Casuarius,casuarius,,,Yes,No,No ,Cape Baron Goose,,Animalia,Chordata,Aves,ANSERIFORMES,ANATIDAE,Cereopsis,novaehollandiae,novaehollandiae,,Yes,Yes,Yes ,Brolga,,Animalia,Chordata,Aves,GRUIFORMES,GRUIDAE,Grus,rubicunda,,,Yes,No,No -,Wedge-tailed Eagle,,Animalia,Chordata,Aves,FALCONIFORMES,ACCIPITRIDAE,Aquila,audax,,,Yes,No,Yes +,Wedge-tailed Eagle,,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Aquila,audax,,,Yes,No,Yes FISH ,Barramundi,,Animalia,CHORDATA,ACTINOPTERYGII,PERCIFORMES,LATIDAE,Lates,calcarifer,,,yes,yes,yes @@ -116,7 +116,7 @@ REPTILES INVERTEBRATES ,Red-backed Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,THERIDIIDAE,Latrodectus,hasseltii,,,yes,yes,yes -,Sydney Funnelweb Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,HEXATHELIDAE,Atrax,robustus,,,yes,yes,yes +,Sydney Funnelweb 
Spider,,Animalia,Arthropoda,Arachnida,Araneae,Atracidae,Atrax,robustus,,,yes,yes,yes ,Red-headed Mouse Spider,,Animalia,Arthropoda,Arachnida,ARANEAE,ACTINOPODIDAE,Missulena,occatoria,,,yes,yes,yes ,Cairn's Birdwing,,Animalia,Arthropoda,Insecta,LEPIDOPTERA,PAPILIONIDAE,Ornithoptera,priamus,,,yes,yes,yes ,Cabbage White Butterfly,,Animalia,Arthropoda,Insecta,LEPIDOPTERA,PIERIDAE,Pieris,rapae,,,yes,yes,yes @@ -196,12 +196,12 @@ Marine,Blue Groper,Official,Animalia,,,,,Achoerodus,viridis NT Animal,Red Kangaroo,Official,Animalia,CHORDATA,MAMMALIA,DIPROTODONTIA,MACROPODIDAE,Osphranter,rufus,,,yes,yes - limited,yes Plant,Sturt's Desert Rose,official,Plantae,Charophyta,Equisetopsida,MALVALES,MALVACEAE,Gossypium,sturtianum,,,yes,yes - limited,yes -Bird,Wedge-tailed,Official,Animalia,CHORDATA,AVES,FALCONIFORMES,ACCIPITRIDAE,Aquila,audax,,Wedge-tailed Eagle,yes,yes - limited,yes - change image on region page +Bird,Wedge-tailed,Official,Animalia,CHORDATA,AVES,Accipitriformes,Accipitridae,Aquila,audax,,Wedge-tailed Eagle,yes,yes - limited,yes - change image on region page Marine,No emblem QLD Animal,Koala,Official,ANIMALIA,CHORDATA,MAMMALIA,DIPROTODONTIA,PHASCOLARCTIDAE,Phascolarctos,cinereus,,,yes,yes,yes - change image on Region Page -Plant,,Official,,,,,,Vappodes,phalaenopsis,,Cooktown Orchid,yes,yes - limited,yes - limited +Plant,,Official,Plantae,Charophyta,Equisetopsida,Asparagales,Orchidaceae,Dendrobium,bigibbum ,,Cooktown Orchid,yes,yes - limited,yes - limited Bird,Brolga,Official,ANIMALIA,CHORDATA,AVES,GRUIFORMES,GRUIDAE,Grus,rubicunda,,,yes,yes,yes Marine,Anemone Fish,Official,ANIMALIA,,,,,Amphiprion,akindynos diff --git a/ala-name-matching-tools/pom.xml b/ala-name-matching-tools/pom.xml new file mode 100644 index 000000000..a06455898 --- /dev/null +++ b/ala-name-matching-tools/pom.xml @@ -0,0 +1,38 @@ + + + + ala-name-matching + au.org.ala + 4.0 + + 4.0.0 + + ala-name-matching-tools + ALA Name Matching Tools + Tools for testing and analysing name matching indexes + + 
+ + au.org.ala + ala-name-matching-model + ${project.version} + + + au.org.ala + ala-name-matching-search + ${project.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + + commons-cli + commons-cli + ${commons-cli.version} + + + \ No newline at end of file diff --git a/src/main/java/au/org/ala/names/util/NameListComparer.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java similarity index 92% rename from src/main/java/au/org/ala/names/util/NameListComparer.java rename to ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java index 1b19ad0bb..522e97a81 100644 --- a/src/main/java/au/org/ala/names/util/NameListComparer.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListComparer.java @@ -1,12 +1,29 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. + * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. 
+ * + */ + package au.org.ala.names.util; import com.opencsv.*; import au.org.ala.names.model.*; import au.org.ala.names.search.*; +import com.opencsv.exceptions.CsvValidationException; import org.apache.commons.cli.*; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.*; import java.util.*; @@ -19,7 +36,7 @@ * Copyright (c) 2016 CSIRO */ public class NameListComparer { - private static Log log = LogFactory.getLog(NameListComparer.class); + private static Logger log = LoggerFactory.getLogger(NameListComparer.class); private static String[][] TERMS = { { "originalId", "Species", "taxonConceptID", "taxon_concept_lsid", "taxonID" }, @@ -72,7 +89,7 @@ protected String mapTerm(String column) { return null; } - protected void readHeader() throws IOException { + protected void readHeader() throws IOException, CsvValidationException { String[] header = names.readNext(); int i = 0; @@ -230,7 +247,7 @@ public String[] match(String[] row) { return values.toArray(new String[values.size()]); } - public void compare() throws IOException { + public void compare() throws IOException, CsvValidationException { String[] row, match; int count = 0; diff --git a/src/main/java/au/org/ala/names/util/NameListGenerator.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java similarity index 86% rename from src/main/java/au/org/ala/names/util/NameListGenerator.java rename to ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java index e1ee0e0f7..c3992ce11 100644 --- a/src/main/java/au/org/ala/names/util/NameListGenerator.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/NameListGenerator.java @@ -1,15 +1,30 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import com.opencsv.CSVWriter; import au.org.ala.names.model.SynonymType; import org.apache.commons.cli.*; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexableField; import org.apache.lucene.store.FSDirectory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.*; import java.util.*; @@ -23,7 +38,7 @@ * Copyright (c) 2015 CSIRO */ public class NameListGenerator implements Closeable { - private static Log log = LogFactory.getLog(NameListGenerator.class); + private static Logger log = LoggerFactory.getLogger(NameListGenerator.class); private static String[][] FIELDS = { {"lsid", "taxonID"}, diff --git a/src/main/java/au/org/ala/names/util/TermDump.java b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java similarity index 76% rename from src/main/java/au/org/ala/names/util/TermDump.java rename to ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java index 5be096686..aaf356902 100644 --- a/src/main/java/au/org/ala/names/util/TermDump.java +++ b/ala-name-matching-tools/src/main/java/au/org/ala/names/util/TermDump.java @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021 Atlas of Living Australia + * All Rights Reserved. 
+ * + * The contents of this file are subject to the Mozilla Public + * License Version 1.1 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or + * implied. See the License for the specific language governing + * rights and limitations under the License. + * + */ + package au.org.ala.names.util; import org.apache.commons.cli.*; @@ -6,6 +22,8 @@ import org.apache.lucene.util.BytesRef; import java.io.*; +import java.util.HashSet; +import java.util.Set; /** * Dump the terms in an index. @@ -33,7 +51,11 @@ public TermDump(File index, Writer output) { public void dump() throws IOException { DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.index.toPath())); - Fields fields = MultiFields.getFields(reader); + Set fields = new HashSet<>(); + for (LeafReaderContext lc: reader.leaves()) { + for (FieldInfo fi: lc.reader().getFieldInfos()) + fields.add(fi.name); + } PrintWriter pw = new PrintWriter(this.output); for (String field: fields) { diff --git a/ala-name-matching-tools/src/main/resources/log4j.xml b/ala-name-matching-tools/src/main/resources/log4j.xml new file mode 100644 index 000000000..e0fbd4773 --- /dev/null +++ b/ala-name-matching-tools/src/main/resources/log4j.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/data/ala-taxon-config.json b/data/ala-taxon-config.json index f04775e7e..a21af6956 100644 --- a/data/ala-taxon-config.json +++ b/data/ala-taxon-config.json @@ -26,6 +26,21 @@ { "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", "taxonomicStatus": "INFERRED_INVALID" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "REGEX", + "scientificName": "Unknown( .*|)" + }, + { + 
"@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "INSENSITIVE", + "scientificName": "Not assigned" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "matchType": "REGEX", + "scientificName": "[A-Z][A-Za-z]+ sp\\.?" } ], "adjustments": [ @@ -183,13 +198,6 @@ }, "adjustment": -20 }, - { - "condition": { - "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", - "nomenclaturalStatus": "FORGOTTEN" - }, - "adjustment": -20 - }, { "condition": { "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", @@ -626,10 +634,12 @@ "Cerapus murrayae": "Cerapus murrayi", "Chelonaplysilla noevus": "Chelonaplysilla naevus", "Chromonephthea muironensis": "Chromonephthea murionensis", + "Cis munitus": "Cis minutus", "Compsopogon coeruleus": "Compsopogon caeruleus", "Cortinarius campbellae": "Cortinarius campbelliae", "Diastylopsis thileniusi": "Diastylopsis thilenuisi", "Difflugia garmen": "Difflugia gramen", + "Encyonema auerwaldsii": "Encyonema auerswaldii", "Euglypha loevis": "Euglypha laevis", "Eumida hawkseburyensis": "Eumida hawkesburyensis", "Euryspongia deliculata": "Euryspongia delicatula", @@ -646,26 +656,35 @@ "Liljeborgia aequiabilis": "Liljeborgia aequabilis", "Limnodriloides winckelmanni": "Limnodriloides wincklemanni", "Liocranchia valdiviae": "Liocranchia valdivae", + "Lyngbya digueti": "Lyngbya diguetii", + "Macromitrium ligulifolium": "Macromitrium ligulaefolium", "Marasmius crinis-equi": "Marasmius crinisequi", "Mesoplodon gingkodens": "Mesoplodon ginkgodens", "Metacirolana basteni": "Metacirolana bastenae", "Mycedium mancoi": "Mycedium mancaoi", "Mysticoncha wilsonae": "Mysticoncha wilsoni", + "Navicula fromenterae": "Navicula formenterae", + "Navicula laterostriata": "Navicula laterostrata", "Nectria quisquiliaris": "Nectria quisquilaris", "Neelaps calonotos": "Neelaps calonotus", "Odontosyllis langerhansaesetosa": "Odontosyllis langerhansiaesetosa", "Paraminabea aldersaldei": 
"Paraminabea aldersladei", "Phyllodoce madierensis": "Phyllodoce madeirensis", + "Phytophthora fragariifolia": "Phytophthora fragariaefolia", "Plumatella \"longigemmis\"": "Plumatella longigemmis", "Prionospio auckalndica": "Prionospio aucklandica", + "Porphyrosiphon notarisii": "Porphyrosiphon notarissi", "Puccinia duthiae": "Puccinia duthiei", "Puccinia argophyllae": "Puccinia argophylli", "Reteporella lacinata": "Reteporella laciniata", "Reteporella malleatia": "Reteporella malleata", "Ringicula doliaris": "Ringicula dolaris", + "Scytonema hofmanni": "Scytonema hofmannii", + "Scytonema viarium": "Scytonema varium", "Smittoidea discoverae": "Smittoidea discoveriae", "Sporisorium australiasiaticum": "Sporisorium australasiaticum", "Stereum amoenum": "Stereum amaenum", + "Sticta wiegelii": "Sticta weigelii", "Stylopoma thornelyae": "Stylopoma thornelyi", "Tectacingulum tumidum": "Tectacingulum tumidium", "Tesarius sulcipennis": "Tesarius suclipennis", @@ -733,8 +752,7 @@ "Plantae": 6000 }, "owner": [ - "Plantae", - "Solanum torvum" + "Plantae" ] }, { @@ -743,7 +761,59 @@ "description": "Australian Plant Name Index entries not placed by the Australian Plant Census, given an assumed parent of Plantae", "parent": "apni-apc", "authority": false, - "defaultScore" : 4000 + "defaultScore" : 4000, + "adjuster": { + "forbidden": [ + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "FORGOTTEN" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "CONFUSED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "ABORTED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "SUPERFLUOUS" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "NUDUM" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": 
"NULL_NAME" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "SUPPRESSED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "REJECTED" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "REJECTED_OUTRIGHT" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "ILLEGITIMATE" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "INVALID" + }, + { + "@class": "au.org.ala.names.index.provider.MatchTaxonCondition", + "nomenclaturalStatus": "DENIED" + } + ] + } }, { "id" : "dr2699", @@ -792,6 +862,17 @@ ] } }, + { + "id" : "dr17664", + "name": "ABRSL", + "description": "ABRS Lichen Checklist", + "parent": "apni-apc", + "rightsHolder": "Commonwealth Scientific and Industrial Research Organisation", + "authority": false, + "defaultScore" : 2500, + "defaultParentTaxon": "Plantae", + "conceptResolutionPriority": "ADDITIONAL" + }, { "id" : "dr2704", "name": "CAAB", diff --git a/data/CB_script_AFD_synonyms.sql b/data/historical/CB_script_AFD_synonyms.sql similarity index 100% rename from data/CB_script_AFD_synonyms.sql rename to data/historical/CB_script_AFD_synonyms.sql diff --git a/data/historical/README.md b/data/historical/README.md new file mode 100644 index 000000000..b3d56a5bf --- /dev/null +++ b/data/historical/README.md @@ -0,0 +1 @@ +Historical files showing the history of data exctraction and use. 
diff --git a/data/all-families.txt b/data/historical/all-families.txt similarity index 100% rename from data/all-families.txt rename to data/historical/all-families.txt diff --git a/src/main/resources/au/org/ala/propertystore/applicationContext-cb.xml b/data/historical/applicationContext-cb.xml similarity index 100% rename from src/main/resources/au/org/ala/propertystore/applicationContext-cb.xml rename to data/historical/applicationContext-cb.xml diff --git a/src/test/resources/au/org/ala/names/search/ba_names.txt b/data/historical/ba_names.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/ba_names.txt rename to data/historical/ba_names.txt diff --git a/src/test/resources/au/org/ala/names/search/bio_aust_birds.txt b/data/historical/bio_aust_birds.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/bio_aust_birds.txt rename to data/historical/bio_aust_birds.txt diff --git a/src/test/resources/au/org/ala/names/search/biocache_animal_col.txt b/data/historical/biocache_animal_col.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/biocache_animal_col.txt rename to data/historical/biocache_animal_col.txt diff --git a/src/test/resources/au/org/ala/names/search/birds.txt b/data/historical/birds.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/birds.txt rename to data/historical/birds.txt diff --git a/src/test/resources/au/org/ala/names/search/caab_fish.txt b/data/historical/caab_fish.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/caab_fish.txt rename to data/historical/caab_fish.txt diff --git a/src/main/resources/au/org/ala/db/CoL2010_dwc_export.sql b/data/historical/db/CoL2010_dwc_export.sql similarity index 98% rename from src/main/resources/au/org/ala/db/CoL2010_dwc_export.sql rename to data/historical/db/CoL2010_dwc_export.sql index 240309127..d3b475f22 100644 --- 
a/src/main/resources/au/org/ala/db/CoL2010_dwc_export.sql +++ b/data/historical/db/CoL2010_dwc_export.sql @@ -1,12 +1,12 @@ ---create the COL2010 DwC archive for use in Checklist Bank --- This script needs to group by record id to prevent mulitle entries occurring when a taxa's name has multiple entries in the scientific_name table ---? What is the reason for having mulitple names?? ---Query OK, 2424622 rows affected (23 min 52.22 sec) - -select t.record_id ,ifnull(t.lsid,''), ifnull(replace(replace(t.name, '\n', ' '), '\r',''),'') , if(t.parent_id>0, cast(t.parent_id as CHAR), '') , ifnull(t.taxon,'') , ifnull(cast(accepted.record_id as CHAR),'') , ifnull(replace(replace(accepted.name, '\n', ' '), '\r', ''), ''), ifnull(replace(replace(name.author,'\n',' '), '\r', ''),''), ifnull(replace(replace(name.infraspecies,'\n', ' '), '\r', ''), '') -INTO OUTFILE '/data/checklistbank/rawdata/col2010/DarwinCore.txt' character set UTF8 -from taxa t -LEFT JOIN scientific_names name on t.name_code = name.name_code -LEFT JOIN taxa accepted ON name.accepted_name_code = accepted.name_code and accepted.record_id <> t.record_id -group by t.record_id +--create the COL2010 DwC archive for use in Checklist Bank +-- This script needs to group by record id to prevent mulitle entries occurring when a taxa's name has multiple entries in the scientific_name table +--? What is the reason for having mulitple names?? 
+--Query OK, 2424622 rows affected (23 min 52.22 sec) + +select t.record_id ,ifnull(t.lsid,''), ifnull(replace(replace(t.name, '\n', ' '), '\r',''),'') , if(t.parent_id>0, cast(t.parent_id as CHAR), '') , ifnull(t.taxon,'') , ifnull(cast(accepted.record_id as CHAR),'') , ifnull(replace(replace(accepted.name, '\n', ' '), '\r', ''), ''), ifnull(replace(replace(name.author,'\n',' '), '\r', ''),''), ifnull(replace(replace(name.infraspecies,'\n', ' '), '\r', ''), '') +INTO OUTFILE '/data/checklistbank/rawdata/col2010/DarwinCore.txt' character set UTF8 +from taxa t +LEFT JOIN scientific_names name on t.name_code = name.name_code +LEFT JOIN taxa accepted ON name.accepted_name_code = accepted.name_code and accepted.record_id <> t.record_id +group by t.record_id order by t.record_id \ No newline at end of file diff --git a/src/main/resources/au/org/ala/db/CoL_commonNames.sql b/data/historical/db/CoL_commonNames.sql similarity index 98% rename from src/main/resources/au/org/ala/db/CoL_commonNames.sql rename to data/historical/db/CoL_commonNames.sql index ab89eb367..5e65a96ef 100644 --- a/src/main/resources/au/org/ala/db/CoL_commonNames.sql +++ b/data/historical/db/CoL_commonNames.sql @@ -1,10 +1,10 @@ ---export the CoL common names for use in the name matching API --- We are only interested in the names that are in English or have no assigned language --- Mark the Australian common names so that they be given a higher rating -SELECT cn.common_name, t.name, t.lsid,CASE WHEN cn.country='Australia' then 'T' ELSE '' END -INTO OUTFILE '/data/exports/col_common_names.txt' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' -FROM common_names cn -JOIN scientific_names sn ON cn.name_code = sn.name_code -JOIN taxa t ON sn.accepted_name_code = t.name_code -WHERE cn.language = 'English' or cn.language is null or cn.language='English;English' or cn.language ='' +--export the CoL common names for use in the name matching API +-- We are only interested in the 
names that are in English or have no assigned language +-- Mark the Australian common names so that they be given a higher rating +SELECT cn.common_name, t.name, t.lsid,CASE WHEN cn.country='Australia' then 'T' ELSE '' END +INTO OUTFILE '/data/exports/col_common_names.txt' FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY '\n' +FROM common_names cn +JOIN scientific_names sn ON cn.name_code = sn.name_code +JOIN taxa t ON sn.accepted_name_code = t.name_code +WHERE cn.language = 'English' or cn.language is null or cn.language='English;English' or cn.language ='' GROUP BY cn.common_name, t.name,t.lsid,CASE WHEN cn.country='Australia' then 'T' ELSE '' END \ No newline at end of file diff --git a/src/main/resources/au/org/ala/db/checklist_bank_model_additions_ala.sql b/data/historical/db/checklist_bank_model_additions_ala.sql similarity index 98% rename from src/main/resources/au/org/ala/db/checklist_bank_model_additions_ala.sql rename to data/historical/db/checklist_bank_model_additions_ala.sql index 5578d88ff..ea10cfe45 100644 --- a/src/main/resources/au/org/ala/db/checklist_bank_model_additions_ala.sql +++ b/data/historical/db/checklist_bank_model_additions_ala.sql @@ -1,121 +1,121 @@ --- inserting some missing ranks so that all the exported values will have a valid (?) rank --- not 100% sure about some of the mappings. 
--- 575 infrageneric --- 725 subvariety --- 825 cultivar --- 875 unranked --- 900 supergenericname - -DELETE FROM term_gbif_portal_rank WHERE term_fk in (575, 725, 825, 875, 900); - -INSERT INTO term_gbif_portal_rank (term_fk, portal_rank) VALUES -(575, 6925), -(725,8015), -(825,8050), -(875, 0), -(900,8200) -; - --- Create the view that is necessary to export the taxon names in the format that ALA needs ---WHEN sci_pn.is_hybrid_formula = true THEN 1 (necessary for the old CB repository) ---WHEN sci.type = 5 THEN 1 (for the new) -CREATE OR REPLACE VIEW export_ala_taxon_name AS - SELECT COALESCE(can.id, sci.id) AS id, COALESCE(can.scientific_name, sci.scientific_name) AS canonical, - - CASE - WHEN tr.portal_rank < 6000 THEN sci_pn.monomial - ELSE NULL::character varying - END AS supra_generic, - CASE - WHEN tr.portal_rank >= 6000 THEN sci_pn.monomial - ELSE NULL::character varying - END AS generic, NULL::text AS infrageneric, sci_pn.specific_epithet, sci_pn.infra_specific_epithet AS infraspecific, NULL::text AS infraspecific_marker, - CASE - WHEN sci.type = 5 THEN 1 - ELSE 0 - END AS is_hybrid, tr.portal_rank AS rank, sci_pn.authorship AS author, NULL::unknown AS searchable_canonical - FROM name_usage nu - JOIN name_string sci ON nu.name_fk = sci.id - LEFT JOIN name_string can ON sci.canonical_name_fk = can.id - LEFT JOIN parsed_name sci_pn ON sci_pn.name_fk = sci.id - LEFT JOIN term_gbif_portal_rank tr ON nu.rank_fk = tr.term_fk - WHERE nu.checklist_fk = 1 - GROUP BY 1,2,3,4,5,6,7,8,9,10,11 - ORDER BY COALESCE(can.scientific_name, sci.scientific_name), tr.portal_rank, sci_pn.authorship; - ---create the view used for the taxon_concepts ---remove all the parent_fks and kingdom_fks that refer back to the "incertae sedis" record ie id=9. 
- -CREATE OR REPLACE VIEW ala_dwc_classification AS - SELECT u.id AS id, u.name_fk, n.id as sci_name_id,COALESCE(n.canonical_name_fk, n.id) as can_id,n.scientific_name, COALESCE(nc.scientific_name, n.scientific_name) AS canonical_name, u.lexical_group_fk, u.lft AS lft, u.rgt AS rgt, (COALESCE(np.authorship, ''::character varying)::text || - CASE - WHEN np.year IS NOT NULL THEN ', '::text || np.year::text - ELSE ''::text - END) || - CASE - WHEN np.authorship_basionym IS NOT NULL OR np.year_basionym IS NOT NULL THEN (' ('::text || COALESCE((np.authorship_basionym::text || ', '::text) || np.year_basionym::text, np.authorship_basionym::text, np.year_basionym::text)) || ')'::text - ELSE ''::text - END AS authorship, case u.parent_fk when 9 then null else u.parent_fk end, u.is_synonym, u.rank_fk, r.term as rank, case u.kingdom_fk when 9 then null else u.kingdom_fk end, knc.scientific_name AS kingdom, u.phylum_fk, COALESCE(pnc.scientific_name, pn.scientific_name) AS phylum, u.class_fk, COALESCE(cnc.scientific_name, cn.scientific_name) AS class, u.order_fk, COALESCE(onc.scientific_name, onn.scientific_name) AS "order", u.family_fk, COALESCE(fnc.scientific_name, fn.scientific_name) AS family, u.genus_fk, COALESCE(gnc.scientific_name, gn.scientific_name) AS genus, u.species_fk, COALESCE(snc.scientific_name, sn.scientific_name) AS species - FROM name_usage u - LEFT JOIN name_string n ON u.name_fk = n.id - LEFT JOIN name_string nc ON n.canonical_name_fk = nc.id - LEFT JOIN parsed_name np ON np.name_fk = n.id - LEFT JOIN term r ON u.rank_fk = r.id - LEFT JOIN name_usage ku ON u.kingdom_fk = ku.id - LEFT JOIN name_string kn ON ku.name_fk = kn.id - LEFT JOIN name_string knc ON kn.canonical_name_fk = knc.id - LEFT JOIN name_usage pu ON u.phylum_fk = pu.id - LEFT JOIN name_string pn ON pu.name_fk = pn.id - LEFT JOIN name_string pnc ON pn.canonical_name_fk = pnc.id - LEFT JOIN name_usage cu ON u.class_fk = cu.id - LEFT JOIN name_string cn ON cu.name_fk = cn.id - LEFT JOIN 
name_string cnc ON cn.canonical_name_fk = cnc.id - LEFT JOIN name_usage ou ON u.order_fk = ou.id - LEFT JOIN name_string onn ON ou.name_fk = onn.id - LEFT JOIN name_string onc ON onn.canonical_name_fk = onc.id - LEFT JOIN name_usage fu ON u.family_fk = fu.id - LEFT JOIN name_string fn ON fu.name_fk = fn.id - LEFT JOIN name_string fnc ON fn.canonical_name_fk = fnc.id - LEFT JOIN name_usage gu ON u.genus_fk = gu.id - LEFT JOIN name_string gn ON gu.name_fk = gn.id - LEFT JOIN name_string gnc ON gn.canonical_name_fk = gnc.id - LEFT JOIN name_usage su ON u.species_fk = su.id - LEFT JOIN name_string sn ON su.name_fk = sn.id - LEFT JOIN name_string snc ON sn.canonical_name_fk = snc.id - WHERE u.checklist_fk = 1; - ---may need to materialise the view so that SELECT statements are performant Query returned successfully with no result in 3842271 ms. -drop table IF EXISTS tmp_export_name_usage; - -create table tmp_export_name_usage AS SELECT * from ala_dwc_classification; -CREATE INDEX tmp_export_name_id_idx - ON tmp_export_name_usage - USING btree - (id) - WITH (FILLFACTOR=90); - ---create a tmp table with index on lookup columns to improve the performance of the lsid identifier lookup -drop table IF EXISTS tmp_identifiers; - -create table tmp_identifiers( -id serial NOT NULL, -lexical_group_fk integer, -name_fk integer, -identifier character varying(500), -checklist_fk integer -); -CREATE INDEX idx_tmp_ids_lg - ON tmp_identifiers - USING btree - (lexical_group_fk, name_fk); - ---insert the lsid type identifiers into the temporary identifiers table. 2636708 rows affected, 1107075 ms ---2622695 rows affected, 1129295 ms ---Query returned successfully: 2560234 rows affected, 1781306 ms execution time. 
--- NC: Added a order by identifier so that the consistent LSIDs are reported when multiple LSIDs exist for one taxon -INSERT into tmp_identifiers (lexical_group_fk, name_fk, identifier,checklist_fk) -SELECT nu.lexical_group_fk, COALESCE(ns.canonical_name_fk, ns.id), i.identifier, nu.checklist_fk FROM identifier i JOIN name_usage nu ON i.usage_fk = nu.id JOIN name_string ns on nu.name_fk = ns.id where i.type_fk = 2001 ORDER BY CASE nu.checklist_fk WHEN 1001 THEN 1 WHEN 1002 THEN 2 WHEN 1003 THEN 3 ELSE 4 END, i.identifier; - ---The SQL below identifies potential lexical groups that will have issues when the nub is genertaed ---The is specific to when 2 different ranks belong to the same lexical group eg Plecoptera is an ORDER and GENUS +-- inserting some missing ranks so that all the exported values will have a valid (?) rank +-- not 100% sure about some of the mappings. +-- 575 infrageneric +-- 725 subvariety +-- 825 cultivar +-- 875 unranked +-- 900 supergenericname + +DELETE FROM term_gbif_portal_rank WHERE term_fk in (575, 725, 825, 875, 900); + +INSERT INTO term_gbif_portal_rank (term_fk, portal_rank) VALUES +(575, 6925), +(725,8015), +(825,8050), +(875, 0), +(900,8200) +; + +-- Create the view that is necessary to export the taxon names in the format that ALA needs +--WHEN sci_pn.is_hybrid_formula = true THEN 1 (necessary for the old CB repository) +--WHEN sci.type = 5 THEN 1 (for the new) +CREATE OR REPLACE VIEW export_ala_taxon_name AS + SELECT COALESCE(can.id, sci.id) AS id, COALESCE(can.scientific_name, sci.scientific_name) AS canonical, + + CASE + WHEN tr.portal_rank < 6000 THEN sci_pn.monomial + ELSE NULL::character varying + END AS supra_generic, + CASE + WHEN tr.portal_rank >= 6000 THEN sci_pn.monomial + ELSE NULL::character varying + END AS generic, NULL::text AS infrageneric, sci_pn.specific_epithet, sci_pn.infra_specific_epithet AS infraspecific, NULL::text AS infraspecific_marker, + CASE + WHEN sci.type = 5 THEN 1 + ELSE 0 + END AS is_hybrid, 
tr.portal_rank AS rank, sci_pn.authorship AS author, NULL::unknown AS searchable_canonical + FROM name_usage nu + JOIN name_string sci ON nu.name_fk = sci.id + LEFT JOIN name_string can ON sci.canonical_name_fk = can.id + LEFT JOIN parsed_name sci_pn ON sci_pn.name_fk = sci.id + LEFT JOIN term_gbif_portal_rank tr ON nu.rank_fk = tr.term_fk + WHERE nu.checklist_fk = 1 + GROUP BY 1,2,3,4,5,6,7,8,9,10,11 + ORDER BY COALESCE(can.scientific_name, sci.scientific_name), tr.portal_rank, sci_pn.authorship; + +--create the view used for the taxon_concepts +--remove all the parent_fks and kingdom_fks that refer back to the "incertae sedis" record ie id=9. + +CREATE OR REPLACE VIEW ala_dwc_classification AS + SELECT u.id AS id, u.name_fk, n.id as sci_name_id,COALESCE(n.canonical_name_fk, n.id) as can_id,n.scientific_name, COALESCE(nc.scientific_name, n.scientific_name) AS canonical_name, u.lexical_group_fk, u.lft AS lft, u.rgt AS rgt, (COALESCE(np.authorship, ''::character varying)::text || + CASE + WHEN np.year IS NOT NULL THEN ', '::text || np.year::text + ELSE ''::text + END) || + CASE + WHEN np.authorship_basionym IS NOT NULL OR np.year_basionym IS NOT NULL THEN (' ('::text || COALESCE((np.authorship_basionym::text || ', '::text) || np.year_basionym::text, np.authorship_basionym::text, np.year_basionym::text)) || ')'::text + ELSE ''::text + END AS authorship, case u.parent_fk when 9 then null else u.parent_fk end, u.is_synonym, u.rank_fk, r.term as rank, case u.kingdom_fk when 9 then null else u.kingdom_fk end, knc.scientific_name AS kingdom, u.phylum_fk, COALESCE(pnc.scientific_name, pn.scientific_name) AS phylum, u.class_fk, COALESCE(cnc.scientific_name, cn.scientific_name) AS class, u.order_fk, COALESCE(onc.scientific_name, onn.scientific_name) AS "order", u.family_fk, COALESCE(fnc.scientific_name, fn.scientific_name) AS family, u.genus_fk, COALESCE(gnc.scientific_name, gn.scientific_name) AS genus, u.species_fk, COALESCE(snc.scientific_name, sn.scientific_name) AS 
species + FROM name_usage u + LEFT JOIN name_string n ON u.name_fk = n.id + LEFT JOIN name_string nc ON n.canonical_name_fk = nc.id + LEFT JOIN parsed_name np ON np.name_fk = n.id + LEFT JOIN term r ON u.rank_fk = r.id + LEFT JOIN name_usage ku ON u.kingdom_fk = ku.id + LEFT JOIN name_string kn ON ku.name_fk = kn.id + LEFT JOIN name_string knc ON kn.canonical_name_fk = knc.id + LEFT JOIN name_usage pu ON u.phylum_fk = pu.id + LEFT JOIN name_string pn ON pu.name_fk = pn.id + LEFT JOIN name_string pnc ON pn.canonical_name_fk = pnc.id + LEFT JOIN name_usage cu ON u.class_fk = cu.id + LEFT JOIN name_string cn ON cu.name_fk = cn.id + LEFT JOIN name_string cnc ON cn.canonical_name_fk = cnc.id + LEFT JOIN name_usage ou ON u.order_fk = ou.id + LEFT JOIN name_string onn ON ou.name_fk = onn.id + LEFT JOIN name_string onc ON onn.canonical_name_fk = onc.id + LEFT JOIN name_usage fu ON u.family_fk = fu.id + LEFT JOIN name_string fn ON fu.name_fk = fn.id + LEFT JOIN name_string fnc ON fn.canonical_name_fk = fnc.id + LEFT JOIN name_usage gu ON u.genus_fk = gu.id + LEFT JOIN name_string gn ON gu.name_fk = gn.id + LEFT JOIN name_string gnc ON gn.canonical_name_fk = gnc.id + LEFT JOIN name_usage su ON u.species_fk = su.id + LEFT JOIN name_string sn ON su.name_fk = sn.id + LEFT JOIN name_string snc ON sn.canonical_name_fk = snc.id + WHERE u.checklist_fk = 1; + +--may need to materialise the view so that SELECT statements are performant Query returned successfully with no result in 3842271 ms. 
+drop table IF EXISTS tmp_export_name_usage; + +create table tmp_export_name_usage AS SELECT * from ala_dwc_classification; +CREATE INDEX tmp_export_name_id_idx + ON tmp_export_name_usage + USING btree + (id) + WITH (FILLFACTOR=90); + +--create a tmp table with index on lookup columns to improve the performance of the lsid identifier lookup +drop table IF EXISTS tmp_identifiers; + +create table tmp_identifiers( +id serial NOT NULL, +lexical_group_fk integer, +name_fk integer, +identifier character varying(500), +checklist_fk integer +); +CREATE INDEX idx_tmp_ids_lg + ON tmp_identifiers + USING btree + (lexical_group_fk, name_fk); + +--insert the lsid type identifiers into the temporary identifiers table. 2636708 rows affected, 1107075 ms +--2622695 rows affected, 1129295 ms +--Query returned successfully: 2560234 rows affected, 1781306 ms execution time. +-- NC: Added a order by identifier so that the consistent LSIDs are reported when multiple LSIDs exist for one taxon +INSERT into tmp_identifiers (lexical_group_fk, name_fk, identifier,checklist_fk) +SELECT nu.lexical_group_fk, COALESCE(ns.canonical_name_fk, ns.id), i.identifier, nu.checklist_fk FROM identifier i JOIN name_usage nu ON i.usage_fk = nu.id JOIN name_string ns on nu.name_fk = ns.id where i.type_fk = 2001 ORDER BY CASE nu.checklist_fk WHEN 1001 THEN 1 WHEN 1002 THEN 2 WHEN 1003 THEN 3 ELSE 4 END, i.identifier; + +--The SQL below identifies potential lexical groups that will have issues when the nub is genertaed +--The is specific to when 2 different ranks belong to the same lexical group eg Plecoptera is an ORDER and GENUS -- select lg.id, count(distinct preferred_term_fk) from lexical_group lg join name_usage nu on lg.id = nu.lexical_group_fk join term r on nu.rank_fk=r.id group by lg.id having count(distinct preferred_term_fk)>1 \ No newline at end of file diff --git a/src/main/resources/au/org/ala/db/irmng.sql b/data/historical/db/irmng.sql similarity index 98% rename from 
src/main/resources/au/org/ala/db/irmng.sql rename to data/historical/db/irmng.sql index db5cef0cd..d7af6c34a 100644 --- a/src/main/resources/au/org/ala/db/irmng.sql +++ b/data/historical/db/irmng.sql @@ -1,17 +1,17 @@ ---create the list of known homonyms. ---The list assumes that a genus name that appears more than once in genus table is a homonym ---It is better to use this statement to create a list of known homonyms just in case Tony has not updated the DUPLICATE_FLAG field. -select GENUS into outfile '/tmp/known_homonyms.txt' from MASTER_GENLIST GROUP BY GENUS having count(GENUS)>1; - ---create the classification for all the Genus in IRMNG ---This will be used when trying to verify that a synonym has the correct higher classification -select case when UPPER(mf.KINGDOM) like '%UNALLOCATED%' then '' else mf.KINGDOM end, -case when UPPER(mf.PHYLUM) like '%UNALLOCATED%' then '' else mf.PHYLUM end, -case when UPPER(mf.CLASS) like '%UNALLOCATED%' then '' else mf.CLASS end , -case when UPPER(mf.ORDERNAME) like '%UNALLOCATED%' then '' else mf.ORDERNAME end, -case when UPPER(mf.FAMILY) like '%UNALLOCATED%' then '' else mf.FAMILY end, -mg.GENUS, mg.GENUS_ID, IFNULL(mg.SYNONYM_FLAG, ''), IFNULL(mg.IS_SYN_OF_CODE, ''), -IFNULL(mg.IS_SYN_OF_NAME, ''),IFNULL( mg.DUPLICATE_FLAG,'') -into outfile '/tmp/irmng_classification.txt' -from MASTER_GENLIST mg JOIN MASTER_FAMLIST mf on mg.FAMILY_ID = mf.FAMILY_ID - +--create the list of known homonyms. +--The list assumes that a genus name that appears more than once in genus table is a homonym +--It is better to use this statement to create a list of known homonyms just in case Tony has not updated the DUPLICATE_FLAG field. 
+select GENUS into outfile '/tmp/known_homonyms.txt' from MASTER_GENLIST GROUP BY GENUS having count(GENUS)>1; + +--create the classification for all the Genus in IRMNG +--This will be used when trying to verify that a synonym has the correct higher classification +select case when UPPER(mf.KINGDOM) like '%UNALLOCATED%' then '' else mf.KINGDOM end, +case when UPPER(mf.PHYLUM) like '%UNALLOCATED%' then '' else mf.PHYLUM end, +case when UPPER(mf.CLASS) like '%UNALLOCATED%' then '' else mf.CLASS end , +case when UPPER(mf.ORDERNAME) like '%UNALLOCATED%' then '' else mf.ORDERNAME end, +case when UPPER(mf.FAMILY) like '%UNALLOCATED%' then '' else mf.FAMILY end, +mg.GENUS, mg.GENUS_ID, IFNULL(mg.SYNONYM_FLAG, ''), IFNULL(mg.IS_SYN_OF_CODE, ''), +IFNULL(mg.IS_SYN_OF_NAME, ''),IFNULL( mg.DUPLICATE_FLAG,'') +into outfile '/tmp/irmng_classification.txt' +from MASTER_GENLIST mg JOIN MASTER_FAMLIST mf on mg.FAMILY_ID = mf.FAMILY_ID + diff --git a/src/main/resources/au/org/ala/vocab/nomenclatural_status.properties b/data/historical/nomenclatural_status.properties similarity index 100% rename from src/main/resources/au/org/ala/vocab/nomenclatural_status.properties rename to data/historical/nomenclatural_status.properties diff --git a/src/test/resources/au/org/ala/names/search/spatial-distribution-names.txt b/data/historical/spatial-distribution-names.txt similarity index 100% rename from src/test/resources/au/org/ala/names/search/spatial-distribution-names.txt rename to data/historical/spatial-distribution-names.txt diff --git a/pom.xml b/pom.xml index 8bfb7fee9..db8b02d73 100644 --- a/pom.xml +++ b/pom.xml @@ -3,166 +3,78 @@ au.org.ala ala-parent-pom - 9 + 14 4.0.0 au.org.ala ala-name-matching - jar + pom - 3.5 - ALA Name Matching (for Lucene 6 or above) - + 4.0 + + + ala-name-matching-model + ala-name-matching-search + ala-name-matching-builder + ala-name-matching-tools + ala-name-matching-distribution + + + ALA Name Matching (for Lucene 8 or above) + 
scm:git:git@github.com:AtlasOfLivingAustralia/ala-name-matching.git https://github.com/AtlasOfLivingAustralia/ala-name-matching scm:git:git@github.com:AtlasOfLivingAustralia/ala-name-matching.git HEAD + UTF-8 - 6.6.5 + 8.1.0 + 2.12.3 1.8 java18 1.0 + 4.13.1 + 1.7.25 + 1.6.2 + 5.0 + 1.2 + 2.4 + 1.32 + 2.61 - - org.gbif - dwca-io - 1.27 - - - commons-io - commons-io - - - org.slf4j - slf4j-api - - - - - - org.slf4j - slf4j-log4j12 - 1.7.25 - - - org.slf4j - slf4j-api - 1.7.25 - - - commons-collections - commons-collections - 3.2.2 - - - - org.gbif - gbif-common - 0.37 - - - org.gbif - name-parser - 2.24 - - - org.gbif.checklistbank - checklistbank-common - 2.61 - - - org.gbif - dwca-io - 1.32 - - junit junit - 4.12 + ${junit.version} test - - - org.apache.lucene - lucene-core - ${org.apache.lucene.version} - + - org.apache.lucene - lucene-backward-codecs - ${org.apache.lucene.version} - - - org.apache.lucene - lucene-analyzers-common - ${org.apache.lucene.version} - - - org.apache.lucene - lucene-queryparser - ${org.apache.lucene.version} - - - commons-lang - commons-lang - 2.6 - - - xerces - xercesImpl - - - - - com.opencsv - opencsv - 4.1 - jar - - - uk.ac.shef.wit - simmetrics - 1.6.2 + org.slf4j + slf4j-api + ${slf4j.version} - + com.fasterxml.jackson.core jackson-core - 2.8.9 + ${com.fasterxml.jackson.version} com.fasterxml.jackson.core jackson-databind - 2.8.9 + ${com.fasterxml.jackson.version} com.fasterxml.jackson.core jackson-annotations - 2.8.9 - - - - commons-cli - commons-cli - 1.2 + ${com.fasterxml.jackson.version} @@ -180,29 +92,6 @@ UTF8 - - org.apache.maven.plugins - maven-assembly-plugin - - - src/assembly/assembly.xml - - - - true - lib/ - - - - - - package - - single - - - - org.apache.maven.plugins maven-source-plugin @@ -232,19 +121,6 @@ - - org.apache.maven.plugins - maven-jar-plugin - - - - true - lib/ - au.org.ala.names.search.DwcaNameIndexer - - - - @@ -253,43 +129,6 @@ travis - - org.apache.maven.plugins - maven-jar-plugin - 2.4 - - - - true 
- lib/ - au.org.ala.names.search.DwcaNameIndexer - - - - - - org.apache.maven.plugins - maven-assembly-plugin - - - src/assembly/assembly.xml - - - - true - lib/ - - - - - - package - - single - - - - diff --git a/src/assembly/assembly.xml b/src/assembly/assembly.xml deleted file mode 100644 index 25a1e4f4c..000000000 --- a/src/assembly/assembly.xml +++ /dev/null @@ -1,30 +0,0 @@ - - distribution - - zip - - false - - - true - true - false - runtime - 0755 - 0755 - - au.org.ala:ala-name-matching - - - - false - true - false - runtime - 0644 - 0755 - lib - - - - \ No newline at end of file diff --git a/src/main/java/au/org/ala/names/index/IssueType.java b/src/main/java/au/org/ala/names/index/IssueType.java deleted file mode 100644 index 57bfc62a8..000000000 --- a/src/main/java/au/org/ala/names/index/IssueType.java +++ /dev/null @@ -1,22 +0,0 @@ -package au.org.ala.names.index; - -/** - * Vocabulary for reporting issues. - * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -public enum IssueType { - /** An invalid source taxonomy */ - VALIDATION, - /** An error likely to make a taxonomy unusable */ - ERROR, - /** A problem loading the taxonomy that needs to be addressed */ - PROBLEM, - /** A collision between concepts */ - COLLISION, - /** A note about processing */ - NOTE, - /** A statistic of some sort */ - COUNT -} diff --git a/src/main/java/au/org/ala/names/index/provider/NameMatchType.java b/src/main/java/au/org/ala/names/index/provider/NameMatchType.java deleted file mode 100644 index 62f9c2218..000000000 --- a/src/main/java/au/org/ala/names/index/provider/NameMatchType.java +++ /dev/null @@ -1,18 +0,0 @@ -package au.org.ala.names.index.provider; - -/** - * How to match a name or author - * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -public enum NameMatchType { - /** Exact match */ - EXACT, - /** Case and space insensitive */ - 
INSENSITIVE, - /** Normalised by GBIF name analysis @see org.gbif.checklistbank.utils.SciNameNormalizer @see org.gbif.checklistbank.authorship.AuthorComparator */ - NORMALISED, - /** Reguilar expression match */ - REGEX -} diff --git a/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java b/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java deleted file mode 100644 index 6e2f12fb9..000000000 --- a/src/main/java/au/org/ala/names/lucene/analyzer/LowerCaseKeywordAnalyzer.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) 2014 Atlas of Living Australia - * All Rights Reserved. - * - * The contents of this file are subject to the Mozilla Public - * License Version 1.1 (the "License"); you may not use this file - * except in compliance with the License. You may obtain a copy of - * the License at http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS - * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or - * implied. See the License for the specific language governing - * rights and limitations under the License. 
- */ -package au.org.ala.names.lucene.analyzer; - -import java.io.IOException; -import java.io.Reader; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.util.Version; - -/** - * A custom KeywordAnalyzer that converts the text to lowercase before tokenizing - * the complete string as one token - * - * @author Natasha - */ -public final class LowerCaseKeywordAnalyzer extends Analyzer { - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - - KeywordTokenizer src = new KeywordTokenizer(); - TokenStream result = new LowerCaseFilter(src); - - return new TokenStreamComponents(src, result) { - - @Override - protected void setReader(final Reader reader){ - super.setReader(reader); - } - }; - } -} diff --git a/src/main/java/au/org/ala/names/model/NameFlag.java b/src/main/java/au/org/ala/names/model/NameFlag.java deleted file mode 100644 index 74207ee73..000000000 --- a/src/main/java/au/org/ala/names/model/NameFlag.java +++ /dev/null @@ -1,12 +0,0 @@ -package au.org.ala.names.model; - -/** - * Flags indicating special-case information about a name - * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2019 Atlas of Living Australia - */ -public enum NameFlag { - /** The name is an autonymn, meaning that it has been created without an author because a sub-taxon was created */ - AUTONYM -} diff --git a/src/main/java/au/org/ala/names/model/NameIndexField.java b/src/main/java/au/org/ala/names/model/NameIndexField.java deleted file mode 100644 index baafd43ae..000000000 --- a/src/main/java/au/org/ala/names/model/NameIndexField.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (C) 2014 Atlas of Living Australia - * All Rights Reserved. 
- * - * The contents of this file are subject to the Mozilla Public - * License Version 1.1 (the "License"); you may not use this file - * except in compliance with the License. You may obtain a copy of - * the License at http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS - * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or - * implied. See the License for the specific language governing - * rights and limitations under the License. - */ -package au.org.ala.names.model; - -/** - * An Enum for all the fields that are indexed for the name matching. This enum is used by - * {@link au.org.ala.names.search.ALANameIndexer} to create the index and - * {@link au.org.ala.names.search.ALANameSearcher} to search the index - * - * @author Natasha Carter - */ -public enum NameIndexField { - ID("id"), - LSID("lsid"), - PARENT_ID("parent_id"), - DOCUMENT_TYPE("doctype"), - ACCEPTED("accepted_lsid"), - iS_SYNONYM("is_synonym"),//whether or not the record is a synonym - GENUS("genus"), - GENUS_EX("genus_ex"), //genus sounds like expression - handles masculine and feminine too. - SPECIES_EX("specific_ex"),// specific epithet sounds like expression - INFRA_EX("infra_ex"),//infra specific epithet sounds like expression - SPECIFIC("specific"), - INFRA_SPECIFIC("infra"), - NAME("name"),// search name - OTHER_NAMES("other_names"),// Alternative names - NAME_CANONICAL("name_canonical"), // Canonical name - NAME_COMPLETE("name_complete"), // Complete name - RANK_ID("rank_id"), - RANK("rank"), - AUTHOR("author"), - PHRASE("phrase"),//stores the values of a "phrase" name. Some more intelligence will be needed when matching these - VOUCHER("voucher"), //stores a voucher value minus the spaces and fullstops. 
- ALA("ala"), //stores whether or not it is an ALA generated name - DATASET_ID("dataset_id"), // The source dataset - SYNONYM_TYPE("syn_type"), //stores the type of synonym that it represents - /* Stores the priority score associated with a taxon */ - PRIORITY("priority"); - String name; - - NameIndexField(String name) { - this.name = name; - } - - public String toString() { - return name; - } -} diff --git a/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java b/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java deleted file mode 100644 index 0271b024a..000000000 --- a/src/main/java/au/org/ala/names/model/TaxonomicTypeGroup.java +++ /dev/null @@ -1,20 +0,0 @@ -package au.org.ala.names.model; - -/** - * Groupings of taxonomic types - * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -public enum TaxonomicTypeGroup { - ACCEPTED, - SYNONYM, - MISAPPLIED, - EXCLUDED, - MISCELLANEOUS, - INCERTAE_SEDIS, - SPECIES_INQUIRENDA, - UNPLACED, - DOUBTFUL, - INVALID -} diff --git a/src/main/java/au/org/ala/vocab/Concept.java b/src/main/java/au/org/ala/vocab/Concept.java deleted file mode 100644 index 705acfb4d..000000000 --- a/src/main/java/au/org/ala/vocab/Concept.java +++ /dev/null @@ -1,143 +0,0 @@ -package au.org.ala.vocab; - -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; - -import java.io.IOException; -import java.io.Writer; -import java.net.URI; -import java.net.URISyntaxException; -import java.util.Arrays; -import java.util.List; - -/** - * Abstract vocabulary concept. - *

- * These are modelled as data rather than enums or the like so that ... ahem ... unique source - * vocabularies can be mapped. - *

- * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -@JsonIdentityInfo(generator = ObjectIdGenerators.PropertyGenerator.class, property = "id") -@JsonTypeInfo(use=JsonTypeInfo.Id.CLASS, include=JsonTypeInfo.As.PROPERTY, property="@class") -@JsonInclude(JsonInclude.Include.NON_NULL) -abstract public class Concept> { - /** The concept URI */ - @JsonProperty - private URI uri; - /** The concept id; a unique identifier */ - @JsonProperty - private String id; - /** Alternative names for a concept */ - @JsonProperty - private List names; - /** The concept description */ - @JsonProperty - private String description; - /** The concept vocabulary that this concept is a member of */ - @JsonManagedReference - private Vocabulary vocabulary; - /** A parent concept */ - @JsonProperty - private Concept parent; - - public Concept() { - } - - public Concept(Vocabulary vocabulary, URI uri, String id, String description, Concept parent, String... names) { - this.vocabulary = vocabulary; - this.uri = uri; - this.id = id; - this.names = names == null ? null : Arrays.asList(names); - this.description = description; - this.parent = parent; - } - - public Concept(Vocabulary vocabulary, String id, String description, Concept parent, String... names) { - this(vocabulary, null, id, description, parent, names); - try { - this.uri = new URI(this.vocabulary.getUri().getScheme(), this.vocabulary.getUri().getSchemeSpecificPart(), id); - } catch (URISyntaxException ex) { - throw new IllegalArgumentException("Unable to construct concept " + id, ex); - } - } - - public Concept(Vocabulary vocabulary, String id, String... names) { - this(vocabulary, id, null, null, names); - } - - - /** - * Get the URI associated with this concept. - * - * @return The concept URI - */ - public URI getUri() { - return uri; - } - - /** - * Get the id of the concept. - *

- * The id is a unique identifier for this concept. - *

- * - * @return The concept id - */ - public String getId() { - return id; - } - - /** - * Get the list of alternative names for a concept. - * - * @return The alternative name list - */ - public List getNames() { - return names; - } - - /** - * Get the long description of the concept. - * - * @return The long description - */ - public String getDescription() { - return description; - } - - - /** - * Get the vocabulary that the concept is part of - * - * @return The vocabulary - */ - public Vocabulary getVocabulary() { - return vocabulary; - } - - /** - * Get the parent concept. - * - * @return A wider or more general version of the concept. - */ - public Concept getParent() { - return parent; - } - - /** - * Write the concept to a writer - * - * @param writer - * @throws IOException - */ - public void write(Writer writer) throws IOException { - ObjectMapper mapper = new ObjectMapper(); - - mapper.enable(SerializationFeature.INDENT_OUTPUT); - mapper.writeValue(writer, this); - } -} diff --git a/src/main/java/au/org/ala/vocab/TaxonRank.java b/src/main/java/au/org/ala/vocab/TaxonRank.java deleted file mode 100644 index e03e5c9b5..000000000 --- a/src/main/java/au/org/ala/vocab/TaxonRank.java +++ /dev/null @@ -1,63 +0,0 @@ -package au.org.ala.vocab; - -/** - * A taxonomic rank - * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -public class TaxonRank extends Concept { - /** The rank level */ - private int level; - /** Is this rank comparable */ - private boolean comparable; - /** Is this one of the standard linnaean ranks? */ - private boolean linnaean; - - public TaxonRank() { - } - - public TaxonRank(Vocabulary vocabulary, String id, int level, boolean comparable, boolean linnaean, String... names) { - super(vocabulary, id, names); - this.level = level; - this.comparable = comparable; - this.linnaean = linnaean; - } - - /** - * The rank level. - *

- * Larger indicates a lower order (more specific) taxon - *

- * - * @return The level - */ - public int getLevel() { - return level; - } - - /** - * Is this a comparable rank? - *

- * Comparable ranks should have taxa with parent-child in order. - * Non-comparable ranks indicate a - *

- * - * @return True if the rank is comparable - */ - public boolean isComparable() { - return comparable; - } - - /** - * Is this a Linnaean rank? - *

- * One of the big seven (kingdom, phylum, class, order, familty, genus, species) - *

- * - * @return True if a linnaean rank - */ - public boolean isLinnaean() { - return linnaean; - } -} diff --git a/src/main/java/au/org/ala/vocab/Vocabulary.java b/src/main/java/au/org/ala/vocab/Vocabulary.java deleted file mode 100644 index aa8fdc87e..000000000 --- a/src/main/java/au/org/ala/vocab/Vocabulary.java +++ /dev/null @@ -1,98 +0,0 @@ -package au.org.ala.vocab; - -import com.fasterxml.jackson.annotation.JsonBackReference; -import com.fasterxml.jackson.databind.annotation.JsonDeserialize; -import com.fasterxml.jackson.databind.util.StdConverter; - -import java.net.URI; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * A vocabulary constructed from - * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -@JsonDeserialize(converter = Vocabulary.VocabularyConverter.class) -public class Vocabulary> extends Concept> { - /** The concepts */ - @JsonBackReference - private List> concepts; - /** The vocabulary concepts */ - private Map> uriConceptMap; - /** The vocabulary names */ - private Map> nameConceptMap; - - public Vocabulary() { - this.concepts = new ArrayList<>(); - this.uriConceptMap = new HashMap<>(); - this.nameConceptMap = new HashMap<>(); - } - - public Vocabulary(URI uri, String id, String description) { - super(null, uri, id, description, null, null); - this.concepts = new ArrayList<>(); - this.uriConceptMap = new HashMap<>(); - this.nameConceptMap = new HashMap<>(); - } - - /** - * Add a concept to the vocabulary - * - * @param concept The concept - */ - public void add(Concept concept) { - this.concepts.add(concept); - this.resolve(concept); - } - - /** - * Build vocabulary maps to allow get by name/get by URI - */ - protected void resolve() { - this.uriConceptMap = new HashMap<>(this.concepts.size()); - this.nameConceptMap = new HashMap<>(this.concepts.size()); - for (Concept concept: this.concepts) - this.resolve(concept); - 
} - - /** - * Add a concept to the lookup tables - * - * @param concept The concept to add - * - * @throws IllegalStateException if the concept URI or name has already been added - */ - protected void resolve(Concept concept) { - if (concept.getUri() != null) { - if (this.uriConceptMap.containsKey(concept.getUri())) - throw new IllegalStateException("Duplicate uri " + concept.getUri() + " for " + concept.getId()); - this.uriConceptMap.put(concept.getUri(), concept); - } - if (this.nameConceptMap.containsKey(concept.getId())) - throw new IllegalStateException("Duplicate id " + concept.getId()); - this.nameConceptMap.put(concept.getId(), concept); - if (concept.getNames() != null) { - for (String name: concept.getNames()) { - if (this.nameConceptMap.containsKey(concept.getId())) - throw new IllegalStateException("Duplicate name " + name + " for " + concept.getId()); - this.nameConceptMap.put(name, concept); - } - } - } - - /** - * Converter to allow post-construction concept maps to be built - */ - protected static class VocabularyConverter extends StdConverter, Vocabulary> { - @Override - public Vocabulary convert(Vocabulary value) { - value.resolve(); - return value; - } - } - -} diff --git a/src/test/java/au/org/ala/names/search/MatchMetricsTest.java b/src/test/java/au/org/ala/names/search/MatchMetricsTest.java deleted file mode 100644 index 20ea83f95..000000000 --- a/src/test/java/au/org/ala/names/search/MatchMetricsTest.java +++ /dev/null @@ -1,422 +0,0 @@ -package au.org.ala.names.search; - -import au.org.ala.names.model.LinnaeanRankClassification; -import au.org.ala.names.model.MatchMetrics; -import org.junit.Before; -import org.junit.Test; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -/** - * Tests for the rank classification - */ -public class MatchMetricsTest { - private static final float MATCH_TOLERANCE = 0.01f; - private static final LinnaeanRankClassification CLASS1 = new 
LinnaeanRankClassification("Animalia", "Arthropoda", "Insecta", "Hymenoptra", "Formicidae", "Huberria", "Huberia striata", "(Smith, 1876)"); - private static final LinnaeanRankClassification CLASS2 = new LinnaeanRankClassification("Charophyta", "Arthropoda", "Equisetopsida", "Gentianales", "Apocynaceae", "Oxypetalum", "Oxypetalum caeruleum", "(D.Don) Decne."); - private MatchMetrics metrics; - - @Before - public void setup() { - this.metrics = new MatchMetrics(); - } - - @Test - public void testComputeMatch1() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch2() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch3() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setPhylum(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch4() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKlass(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch5() throws Exception { - LinnaeanRankClassification query = new 
LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setOrder(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch6() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setFamily(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch7() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setGenus(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch8() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setScientificName(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch9() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setAuthorship(null); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch10() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setKingdom(null); - this.metrics.computeMatch(query, result, false); - 
assertEquals(0.746, metrics.getMatch(), MATCH_TOLERANCE); - } - - - @Test - public void testComputeMatch11() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setKingdom("Plantae"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.816, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch12() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum(null); - this.metrics.computeMatch(query, result, false); - assertEquals(0.976, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch13() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum("Chordata"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.958, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch14() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum("ARTHROPODA"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch15() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setKlass("Hexapodia"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.947, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch16() throws Exception { - LinnaeanRankClassification query = new 
LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setFamily("PERIPATOPSIDAE"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.942, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch17() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setGenus("Vescerro"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.929, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch18() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setSpecificEpithet("striata"); - result.setSpecificEpithet("trigona"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.975, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch19() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setAuthorship("Smith, 1876"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch20() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setAuthorship("Smith"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch21() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); 
- result.setAuthorship("Jones, 1876"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.854, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch22() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setAuthorship("Jones"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.756, metrics.getMatch(), MATCH_TOLERANCE); - } - - /** - * Testing bad authors without much context, should result in a lowered match. - * - * @throws Exception - */ - @Test - public void testComputeMatch23() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - query.setFamily(null); - query.setGenus(null); - result.setAuthorship("Smith, 1876"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch24() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - query.setFamily(null); - query.setGenus(null); - result.setAuthorship("Smith"); - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch25() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - 
query.setFamily(null); - query.setGenus(null); - result.setAuthorship("Jones, 1876"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.554, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatch26() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - query.setFamily(null); - query.setGenus(null); - result.setAuthorship("Jones"); - this.metrics.computeMatch(query, result, false); - assertEquals(0.255, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatchSynonym1() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS2); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS2); - result.setKingdom(null); - result.setPhylum(null); - result.setKlass(null); - result.setOrder(null); - result.setFamily(null); - result.setGenus(null); - this.metrics.computeMatch(query, result, true); - assertEquals(0.655, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatchSynonym2() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS2); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS2); - result.setKingdom(null); - result.setPhylum(null); - result.setKlass(null); - result.setOrder(null); - result.setFamily(null); - this.metrics.computeMatch(query, result, true); - assertEquals(0.675, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatchSynonym3() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS2); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS2); - result.setKingdom(null); - result.setPhylum(null); - result.setKlass(null); - 
result.setOrder(null); - result.setFamily(null); - result.setAuthorship(null); - this.metrics.computeMatch(query, result, true); - assertEquals(0.482, metrics.getMatch(), MATCH_TOLERANCE); - } - - @Test - public void testComputeMatchSynonym4() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS2); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS2); - result.setKingdom(null); - result.setPhylum(null); - result.setKlass(null); - result.setOrder(null); - result.setFamily(null); - query.setAuthorship(null); - result.setAuthorship(null); - this.metrics.computeMatch(query, result, true); - assertEquals(0.471, metrics.getMatch(), MATCH_TOLERANCE); - } - - - /** - * Test match computation takes less than 1us per match for simple cases - * - * @throws Exception - */ - @Test - public void testComputeMatchTiming1() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - long start = System.currentTimeMillis(); - for (int i = 0; i < 1000000; i++) - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - long time = System.currentTimeMillis() - start; - assertTrue("Took " + time + "ms. 
Required to be less than 4000ms", time < 4000); - } - - /** - * Test match computation takes less than 1us per match for simple cases - * - * @throws Exception - */ - @Test - public void testComputeMatchTiming2() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - query.setKingdom(null); - query.setPhylum(null); - query.setKlass(null); - query.setOrder(null); - long start = System.currentTimeMillis(); - for (int i = 0; i < 1000000; i++) - this.metrics.computeMatch(query, result, false); - assertEquals(1.0, metrics.getMatch(), MATCH_TOLERANCE); - long time = System.currentTimeMillis() - start; - assertTrue("Took " + time + "ms. Required to be less than 4000ms", time < 4000); - } - - /** - * Test match computation takes less than 10us per match for one bodgy result - * - * @throws Exception - */ - @Test - public void testComputeMatchTiming3() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum("Chordata"); - long start = System.currentTimeMillis(); - for (int i = 0; i < 100000; i++) - this.metrics.computeMatch(query, result, false); - assertEquals(0.958, metrics.getMatch(), MATCH_TOLERANCE); - long time = System.currentTimeMillis() - start; - assertTrue("Took " + time + "ms. 
Required to be less than 4000ms", time < 4000); - } - - /** - * Test match computation takes less than 10us per match for two bodgy results - * - * @throws Exception - */ - @Test - public void testComputeMatchTiming4() throws Exception { - LinnaeanRankClassification query = new LinnaeanRankClassification(CLASS1); - LinnaeanRankClassification result = new LinnaeanRankClassification(CLASS1); - result.setPhylum("Chordata"); - result.setGenus("Acacia"); - long start = System.currentTimeMillis(); - for (int i = 0; i < 100000; i++) - this.metrics.computeMatch(query, result, false); - assertEquals(0.873, metrics.getMatch(), MATCH_TOLERANCE); - long time = System.currentTimeMillis() - start; - assertTrue("Took " + time + "ms. Required to be less than 4000ms", time < 4000); - } - - -} diff --git a/src/test/java/au/org/ala/vocab/TaxonRankTest.java b/src/test/java/au/org/ala/vocab/TaxonRankTest.java deleted file mode 100644 index 4696cef1c..000000000 --- a/src/test/java/au/org/ala/vocab/TaxonRankTest.java +++ /dev/null @@ -1,33 +0,0 @@ -package au.org.ala.vocab; - -import au.org.ala.names.util.TestUtils; -import org.junit.Before; -import org.junit.Test; - -import java.io.StringWriter; -import java.net.URI; - -import static org.junit.Assert.assertEquals; - -/** - * Tests for a taxonomic rank. 
- * - * @author Doug Palmer <Doug.Palmer@csiro.au> - * @copyright Copyright © 2017 Atlas of Living Australia - */ -public class TaxonRankTest extends TestUtils { - private Vocabulary vocabulary; - - @Before - public void setup() throws Exception { - this.vocabulary = new Vocabulary<>(URI.create("urm:x-ala:vocabulary:tr-1"), "tr-1", null); - } - - @Test - public void testWrite1() throws Exception { - TaxonRank rank = new TaxonRank(this.vocabulary, "genus", 6000, true, true); - StringWriter sw = new StringWriter(); - rank.write(sw); - assertEquals(this.loadResource("taxon-rank-1.json"), sw.toString()); - } -} diff --git a/src/test/resources/au/org/ala/vocab/taxon-rank-1.json b/src/test/resources/au/org/ala/vocab/taxon-rank-1.json deleted file mode 100644 index 515dc85f0..000000000 --- a/src/test/resources/au/org/ala/vocab/taxon-rank-1.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "@class" : "au.org.ala.vocab.TaxonRank", - "id" : "genus", - "uri" : "urm:x-ala:vocabulary:tr-1#genus", - "names" : [ ], - "vocabulary" : { - "@class" : "au.org.ala.vocab.Vocabulary", - "id" : "tr-1", - "uri" : "urm:x-ala:vocabulary:tr-1" - }, - "level" : 6000, - "comparable" : true, - "linnaean" : true -} \ No newline at end of file