Skip to content

Commit

Permalink
add exceptions to annotations (close #53)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Aug 2, 2021
1 parent 51131ef commit b02e40c
Show file tree
Hide file tree
Showing 6 changed files with 3,628 additions and 3,519 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@

## Unreleased

## [v1.3.1]
## [v1.3.2]

- Add [#182]: support `Do`, `Oo`, `Nu` 2-letter genera.
- Add [#53]: exceptions to annotations (`Bottaria nudum` for example).

## [v1.3.1]

- Add [#180]: Zenodo DOI.

## [v1.3.0]
Expand Down
103 changes: 62 additions & 41 deletions ent/internal/preprocess/preprocess.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,57 @@ import (
var hybridCharRe1 = regexp.MustCompile(`(^)[Xx](\p{Lu})`)
var hybridCharRe2 = regexp.MustCompile(`(\s|^)[Xx](\s|$)`)

// VirusException maps a genus to a specific epithet that reads like a
// virus-related term (`vector`, `prion`, `virus`, `satellites` and so on) but
// is part of an ordinary scientific name, for example `Culex vector` or
// `Exochus virus`. Preprocess runs the IsVirus check only when IsException
// does not match the name against this map.
var VirusException = map[string]string{
"Aspilota": "vector",
"Bembidion": "satellites",
"Bolivina": "prion",
"Ceylonesmus": "vector",
"Cryptops": "vector",
"Culex": "vector",
"Dasyproctus": "cevirus",
"Desmoxytes": "vector",
"Dicathais": "vector",
"Erateina": "satellites",
"Euragallia": "prion",
"Exochus": "virus",
"Hilara": "vector",
"Ithomeis": "satellites",
"Microgoneplax": "prion",
"Neoaemula": "vector",
"Nephodia": "satellites",
"Ophion": "virus",
"Psenulus": "trevirus",
"Tidabius": "vector",
}

// AnnotationException maps a genus to a specific epithet that is a real part
// of the name even though it reads like annotation text, for example
// `Bottaria nudum`. When IsException matches the input against this map,
// Annotation returns the full input length before applying any of its
// regular expressions.
var AnnotationException = map[string]string{
"Acrostichum": "nudum",
"Adiantum": "nudum",
"Africanthion": "nudum",
"Agathidium": "nudum",
"Aphaniosoma": "nudum",
"Aspidium": "nudum",
"Athyrium": "nudum",
"Blechnum": "nudum",
"Bottaria": "nudum",
"Gnathopleustes": "den",
"Lycopodium": "nudum",
"Nephrodium": "nudum",
"Paralvinella": "dela",
"Polypodium": "nudum",
"Polystichum": "nudum",
"Psilotum": "nudum",
"Ruteloryctes": "bis",
"Selenops": "ab",
"Tortolena": "dela",
"Trachyphloeosoma": "nudum",
"Zodarion": "van",
}

// NoParseException maps a genus to a specific epithet for names that must
// still be parsed, for example `Navicula bacterium`. Preprocess resets its
// NoParse flag to false when IsException matches the name against this map.
var NoParseException = map[string]string{
"Navicula": "bacterium",
}

var notesRe = regexp.MustCompile(
`(?i)\s+(environmental|samples|species\s+group|species\s+complex|clade|group|author)\b.*$`,
)
Expand Down Expand Up @@ -68,14 +119,17 @@ func Preprocess(bs []byte) *Preprocessor {
}
i := len(bs)
name := string(bs)
if !VirusLikeName(name) {
if !IsException(name, VirusException) {
pr.Virus = IsVirus(bs[0:i])
}
if pr.Virus {
pr.NoParse = true
return pr
}
pr.NoParse = NoParse(bs[0:i])
if IsException(name, NoParseException) {
pr.NoParse = false
}
if pr.NoParse {
return pr
}
Expand All @@ -96,50 +150,14 @@ func Preprocess(bs []byte) *Preprocessor {
return pr
}

// VirusLikeName takes a string and checks it against known species that can
// easily be mistaken for viruses. It returns true if the string belongs to
// one of these species.
// The following names are covered:
// Aspilota vector Belokobylskij, 2007
// Ceylonesmus vector Chamberlin, 1941
// Cryptops (Cryptops) vector Chamberlin, 1939
// Culex vector Dyar & Knab, 1906
// Dasyproctus cevirus Leclercq, 1963
// Desmoxytes vector (Chamberlin, 1941)
// Dicathais vector Thornley, 1952
// Euragallia prion Kramer, 1976
// Exochus virus Gauld & Sithole, 2002
// Hilara vector Miller, 1923
// Microgoneplax prion Castro, 2007
// Neoaemula vector Mackinnon, Hiller, Long & Marshall, 2008
// Ophion virus Gauld & Mitchell, 1981
// Psenulus trevirus Leclercq, 1961
// Tidabius vector Chamberlin, 1931
func VirusLikeName(name string) bool {
names := map[string]string{
"Aspilota": "vector",
"Ceylonesmus": "vector",
"Cryptops": "vector",
"Culex": "vector",
"Dasyproctus": "cevirus",
"Desmoxytes": "vector",
"Dicathais": "vector",
"Euragallia": "prion",
"Exochus": "virus",
"Hilara": "vector",
"Microgoneplax": "prion",
"Neoaemula": "vector",
"Ophion": "virus",
"Psenulus": "trevirus",
"Tidabius": "vector",
}
func IsException(name string, names map[string]string) bool {
words := strings.Fields(name)
if len(words) < 2 {
return false
}
if epithet, ok := names[words[0]]; ok {
for _, w := range words[1:] {
if strings.HasPrefix(w, epithet) {
if w == epithet {
return true
}
}
Expand All @@ -161,6 +179,9 @@ func NormalizeHybridChar(bs []byte) []byte {
// input.
func Annotation(bs []byte) int {
i := len(bs)
if IsException(string(bs), AnnotationException) {
return i
}
regexps := []*regexp.Regexp{
notesRe, taxonConceptsRe1, taxonConceptsRe2, taxonConceptsRe3,
nomenConceptsRe, lastWordJunkRe, stopWordsRe,
Expand All @@ -178,8 +199,8 @@ func Annotation(bs []byte) int {
// `Anthurium Trustees of the British Museum` should not.
cultivarRankLoc := cultivarRankRe.FindIndex(bs[0:i])
ofLoc := ofWordRe.FindIndex(bs[0:i])
if( len(ofLoc) > 0 && ofLoc[0] < i &&
(len(cultivarRankLoc) == 0 || cultivarRankLoc[0] > ofLoc[0])) {
if len(ofLoc) > 0 && ofLoc[0] < i &&
(len(cultivarRankLoc) == 0 || cultivarRankLoc[0] > ofLoc[0]) {
i = ofLoc[0]
}

Expand Down
46 changes: 45 additions & 1 deletion ent/internal/preprocess/preprocess_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,50 @@ func TestPreprocess(t *testing.T) {
[]byte(v.out), v.msg)
}
})
t.Run("NoParseLikeName", func(t *testing.T) {
data := []struct {
msg string
name string
likeAnnotation bool
}{
{"name", "Navicula bacterium", true},
}
for _, v := range data {
assert.Equal(t, ppr.IsException(v.name, ppr.NoParseException), v.likeAnnotation, v.msg)
}
})
t.Run("AnnotationLikeName", func(t *testing.T) {
data := []struct {
msg string
name string
likeAnnotation bool
}{
{"name", "Acrostichum nudum", true},
{"name", "Adiantum nudum", true},
{"name", "Africanthion nudum", true},
{"name", "Agathidium nudum", true},
{"name", "Aphaniosoma nudum", true},
{"name", "Aspidium nudum", true},
{"name", "Athyrium nudum", true},
{"name", "Blechnum nudum", true},
{"name", "Bottaria nudum", true},
{"name", "Gnathopleustes den", true},
{"name", "Lycopodium nudum", true},
{"name", "Nephrodium nudum", true},
{"name", "Paralvinella dela", true},
{"name", "Polypodium nudum", true},
{"name", "Polystichum nudum", true},
{"name", "Psilotum nudum", true},
{"name", "Ruteloryctes bis", true},
{"name", "Selenops ab", true},
{"name", "Tortolena dela", true},
{"name", "Trachyphloeosoma nudum", true},
{"name", "Zodarion van", true},
}
for _, v := range data {
assert.Equal(t, ppr.IsException(v.name, ppr.AnnotationException), v.likeAnnotation, v.msg)
}
})

t.Run("VirusLikeName", func(t *testing.T) {
data := []struct {
Expand All @@ -139,7 +183,7 @@ func TestPreprocess(t *testing.T) {
{"name17", "Homo sapiens coronavirus", false},
}
for _, v := range data {
assert.Equal(t, ppr.VirusLikeName(v.name), v.likeVirus, v.msg)
assert.Equal(t, ppr.IsException(v.name, ppr.VirusException), v.likeVirus, v.msg)
}
})

Expand Down
2 changes: 1 addition & 1 deletion ent/parser/grammar.peg
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ CultivarApostrophe <- '\'' / '‘' / '’' / '"' / '“' / '”'

SpeciesEpithet <- !(AuthorEx) Word (_? Authorship)?

Comparison <- 'cf' '.'?
Comparison <- 'cf' '.'? &(SpaceCharEOI)

Rank <- (RankForma / RankVar / RankSsp / RankOther / RankOtherUncommon /
RankAgamo / RankNotho) (_? LowerGreek ('.' / &(SpaceCharEOI)))?
Expand Down
Loading

0 comments on commit b02e40c

Please sign in to comment.