diff --git a/CHANGELOG.md b/CHANGELOG.md index f48bb8d..39b33c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ## Unreleased +## [v1.5.4] + +- Add: different approach for normalize-by-type for words. +- Add [#204]: allow genera starting with De-, Eu-, Le-, Ne- (by @tobymarsden). +- Add [#203]: allow up to 2 dashes in genera (by @tobymarsden). + ## [v1.5.3] - Add [#202]: add NormalizeMore function for Word. @@ -331,6 +337,7 @@ array of names instead of a stream. This document follows [changelog guidelines] +[v1.5.4]: https://github.com/gnames/gnparser/compare/v1.5.3...v1.5.4 [v1.5.3]: https://github.com/gnames/gnparser/compare/v1.5.2...v1.5.3 [v1.5.2]: https://github.com/gnames/gnparser/compare/v1.5.1...v1.5.2 [v1.5.1]: https://github.com/gnames/gnparser/compare/v1.5.0...v1.5.1 diff --git a/ent/parsed/words.go b/ent/parsed/words.go index bbec5a0..001748a 100644 --- a/ent/parsed/words.go +++ b/ent/parsed/words.go @@ -5,6 +5,7 @@ import ( "strings" "github.com/gnames/gnparser/ent/stemmer" + "github.com/gnames/gnparser/ent/str" ) // Word represents a parsed word and its meaning in the name-string. @@ -21,21 +22,22 @@ type Word struct { End int `json:"end"` } -// NormalizeMore is useful when searching for a word alone. +// NormalizeByType is useful when searching for a word alone. // In such cases specific epithets will match better when stemmed, -// authors low-cased with stripped periods. -func (w Word) NormalizeMore() string { +// authors and genera low-cased, authors with stripped periods. 
+func NormalizeByType(wrd string, wt WordType) string { var res string - switch w.Type { + wrd = str.Normalize(wrd) + switch wt { case SpEpithetType, InfraspEpithetType: - res = stemmer.Stem(w.Normalized).Stem + res = stemmer.Stem(wrd).Stem case GenusType: - res = strings.ToLower(w.Normalized) + res = strings.ToLower(wrd) case AuthorWordType: - res = strings.ToLower(w.Normalized) + res = strings.ToLower(wrd) res = strings.ReplaceAll(res, ".", "") default: - res = w.Normalized + res = wrd } return res } diff --git a/ent/parser/ast.go b/ent/parser/ast.go index 44bee4a..d674a5a 100644 --- a/ent/parser/ast.go +++ b/ent/parser/ast.go @@ -1146,22 +1146,22 @@ func (p *Engine) newWordNode(n *node32, wt parsed.WordType) *parsed.Word { switch v.pegRule { case ruleDotPrefix: p.addWarn(parsed.DotEpithetWarn) - wrd.Normalized, _ = normalize(wrd.Verbatim) + wrd.Normalized = str.Normalize(wrd.Verbatim) case ruleUpperCharExtended, ruleLowerCharExtended: p.addWarn(parsed.CharBadWarn) - wrd.Normalized, _ = normalize(wrd.Verbatim) + wrd.Normalized = str.Normalize(wrd.Verbatim) case ruleWordApostr: p.addWarn(parsed.CanonicalApostropheWarn) canonicalApostrophe = true - wrd.Normalized, _ = normalize(wrd.Verbatim) + wrd.Normalized = str.Normalize(wrd.Verbatim) case ruleWordStartsWithDigit: p.addWarn(parsed.SpeciesNumericWarn) wrd.Normalized = normalizeNums(wrd.Verbatim) case ruleApostrOther: p.addWarn(parsed.ApostrOtherWarn) if !canonicalApostrophe { - nv, _ := str.ToASCII([]byte(wrd.Verbatim), str.GlobalTransliterations) - wrd.Normalized = string(nv) + nv := str.ToASCII(wrd.Verbatim, str.GlobalTransliterations) + wrd.Normalized = nv } } } @@ -1214,19 +1214,6 @@ func (p *Engine) newCultivarEpithetNode(n *node32, wt parsed.WordType) *cultivar return &cv } -func normalize(s string) (string, error) { - res := s - if s == "" { - return s, nil - } - nv, err := str.ToASCII([]byte(s), str.Transliterations) - if err != nil { - return res, err - } - res = string(nv) - return res, nil -} - var 
numWord = regexp.MustCompile(`^([0-9]+)[-\.]?(.+)$`) func normalizeNums(s string) string { diff --git a/ent/str/str.go b/ent/str/str.go index bb03cac..f1583a7 100644 --- a/ent/str/str.go +++ b/ent/str/str.go @@ -31,8 +31,18 @@ func CapitalizeName(name string) string { return string(runes) } +// Normalize takes a string and returns a normalized version of it. +// Normalize function should be idempotent. +func Normalize(s string) string { + return ToASCII(s, Transliterations) +} + // ToASCII converts a UTF-8 diacritics to corresponding ASCII chars. -func ToASCII(b []byte, m map[rune]string) ([]byte, error) { +func ToASCII(s string, m map[rune]string) string { + if s == "" { + return s + } + b := []byte(s) tlBuf := bytes.NewBuffer(make([]byte, 0, len(b)*125/100)) for i, w := 0, 0; i < len(b); i += w { r, width := utf8.DecodeRune(b[i:]) @@ -43,7 +53,7 @@ func ToASCII(b []byte, m map[rune]string) ([]byte, error) { } w = width } - return tlBuf.Bytes(), nil + return tlBuf.String() } func IsBoldSurrogate(s string) bool { diff --git a/ent/str/str_test.go b/ent/str/str_test.go index 4ec28bf..052dc42 100644 --- a/ent/str/str_test.go +++ b/ent/str/str_test.go @@ -48,8 +48,8 @@ func TestStringTools(t *testing.T) { {"‘", "‘", "", str.Transliterations}, } for _, v := range tests { - res, _ := str.ToASCII([]byte(v.in), v.tbl) - assert.Equal(t, string(res), v.out, v.msg) + res := str.ToASCII(v.in, v.tbl) + assert.Equal(t, res, v.out, v.msg) } }) diff --git a/gnparser_test.go b/gnparser_test.go index 4a07869..a6d4e90 100644 --- a/gnparser_test.go +++ b/gnparser_test.go @@ -74,23 +74,18 @@ func TestParseLowCaseName(t *testing.T) { } } -func TestWordNormalizeMore(t *testing.T) { +func TestWordNormalizeByType(t *testing.T) { tests := []struct { - msg, name string - norm []string + msg, word, norm string + wType parsed.WordType }{ - {"1", "Betula alba Linn.", []string{"betula", "alb", "linn"}}, - {"2", "Plantago major var. 
major", []string{"plantago", "maior", "var.", "maior"}}, + {"betula", "Betula", "betula", parsed.GenusType}, + {"alba", "alba", "alb", parsed.SpEpithetType}, + {"Linn", "Linn.", "linn", parsed.AuthorWordType}, } - cfg := gnparser.NewConfig(gnparser.OptWithDetails(true)) - gnp := gnparser.New(cfg) for _, v := range tests { - p := gnp.ParseName(v.name) - res := make([]string, len(p.Words)) - for i, v := range p.Words { - res[i] = v.NormalizeMore() - } + res := parsed.NormalizeByType(v.word, v.wType) assert.Equal(t, res, v.norm, v.msg) } }