Skip to content

Commit

Permalink
better version of additional normalize (#202)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Nov 14, 2021
1 parent dc67aaf commit 6f60bd5
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 42 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

## Unreleased

## [v1.5.4]

- Add: a different approach to normalize-by-type for words.
- Add [#204]: allow genera starting with De-, Eu-, Le-, Ne- (by @tobymarsden).
- Add [#203]: allow up to 2 dashes in genera (by @tobymarsden).

## [v1.5.3]

- Add [#202]: add NormalizeMore function for Word.
Expand Down Expand Up @@ -331,6 +337,7 @@ array of names instead of a stream.

This document follows [changelog guidelines]

[v1.5.4]: https://github.com/gnames/gnparser/compare/v1.5.3...v1.5.4
[v1.5.3]: https://github.com/gnames/gnparser/compare/v1.5.2...v1.5.3
[v1.5.2]: https://github.com/gnames/gnparser/compare/v1.5.1...v1.5.2
[v1.5.1]: https://github.com/gnames/gnparser/compare/v1.5.0...v1.5.1
Expand Down
18 changes: 10 additions & 8 deletions ent/parsed/words.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"strings"

"github.com/gnames/gnparser/ent/stemmer"
"github.com/gnames/gnparser/ent/str"
)

// Word represents a parsed word and its meaning in the name-string.
Expand All @@ -21,21 +22,22 @@ type Word struct {
End int `json:"end"`
}

// NormalizeMore is useful when searching for a word alone.
// NormalizeByType is useful when searching for a word alone.
// In such cases specific epithets will match better when stemmed,
// authors low-cased with stripped periods.
func (w Word) NormalizeMore() string {
// authors and genera low-cased, authors with stripped periods.
func NormalizeByType(wrd string, wt WordType) string {
var res string
switch w.Type {
wrd = str.Normalize(wrd)
switch wt {
case SpEpithetType, InfraspEpithetType:
res = stemmer.Stem(w.Normalized).Stem
res = stemmer.Stem(wrd).Stem
case GenusType:
res = strings.ToLower(w.Normalized)
res = strings.ToLower(wrd)
case AuthorWordType:
res = strings.ToLower(w.Normalized)
res = strings.ToLower(wrd)
res = strings.ReplaceAll(res, ".", "")
default:
res = w.Normalized
res = wrd
}
return res
}
Expand Down
23 changes: 5 additions & 18 deletions ent/parser/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -1146,22 +1146,22 @@ func (p *Engine) newWordNode(n *node32, wt parsed.WordType) *parsed.Word {
switch v.pegRule {
case ruleDotPrefix:
p.addWarn(parsed.DotEpithetWarn)
wrd.Normalized, _ = normalize(wrd.Verbatim)
wrd.Normalized = str.Normalize(wrd.Verbatim)
case ruleUpperCharExtended, ruleLowerCharExtended:
p.addWarn(parsed.CharBadWarn)
wrd.Normalized, _ = normalize(wrd.Verbatim)
wrd.Normalized = str.Normalize(wrd.Verbatim)
case ruleWordApostr:
p.addWarn(parsed.CanonicalApostropheWarn)
canonicalApostrophe = true
wrd.Normalized, _ = normalize(wrd.Verbatim)
wrd.Normalized = str.Normalize(wrd.Verbatim)
case ruleWordStartsWithDigit:
p.addWarn(parsed.SpeciesNumericWarn)
wrd.Normalized = normalizeNums(wrd.Verbatim)
case ruleApostrOther:
p.addWarn(parsed.ApostrOtherWarn)
if !canonicalApostrophe {
nv, _ := str.ToASCII([]byte(wrd.Verbatim), str.GlobalTransliterations)
wrd.Normalized = string(nv)
nv := str.ToASCII(wrd.Verbatim, str.GlobalTransliterations)
wrd.Normalized = nv
}
}
}
Expand Down Expand Up @@ -1214,19 +1214,6 @@ func (p *Engine) newCultivarEpithetNode(n *node32, wt parsed.WordType) *cultivar
return &cv
}

func normalize(s string) (string, error) {
res := s
if s == "" {
return s, nil
}
nv, err := str.ToASCII([]byte(s), str.Transliterations)
if err != nil {
return res, err
}
res = string(nv)
return res, nil
}

var numWord = regexp.MustCompile(`^([0-9]+)[-\.]?(.+)$`)

func normalizeNums(s string) string {
Expand Down
14 changes: 12 additions & 2 deletions ent/str/str.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,18 @@ func CapitalizeName(name string) string {
return string(runes)
}

// Normalize takes a string and returns a normalized version of it.
// The Normalize function should be idempotent.
func Normalize(s string) string {
return ToASCII(s, Transliterations)
}

// ToASCII converts a UTF-8 diacritics to corresponding ASCII chars.
func ToASCII(b []byte, m map[rune]string) ([]byte, error) {
func ToASCII(s string, m map[rune]string) string {
if s == "" {
return s
}
b := []byte(s)
tlBuf := bytes.NewBuffer(make([]byte, 0, len(b)*125/100))
for i, w := 0, 0; i < len(b); i += w {
r, width := utf8.DecodeRune(b[i:])
Expand All @@ -43,7 +53,7 @@ func ToASCII(b []byte, m map[rune]string) ([]byte, error) {
}
w = width
}
return tlBuf.Bytes(), nil
return tlBuf.String()
}

func IsBoldSurrogate(s string) bool {
Expand Down
4 changes: 2 additions & 2 deletions ent/str/str_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ func TestStringTools(t *testing.T) {
{"‘", "‘", "", str.Transliterations},
}
for _, v := range tests {
res, _ := str.ToASCII([]byte(v.in), v.tbl)
assert.Equal(t, string(res), v.out, v.msg)
res := str.ToASCII(v.in, v.tbl)
assert.Equal(t, res, v.out, v.msg)
}
})

Expand Down
19 changes: 7 additions & 12 deletions gnparser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,23 +74,18 @@ func TestParseLowCaseName(t *testing.T) {
}
}

func TestWordNormalizeMore(t *testing.T) {
func TestWordNormalizeByType(t *testing.T) {
tests := []struct {
msg, name string
norm []string
msg, word, norm string
wType parsed.WordType
}{
{"1", "Betula alba Linn.", []string{"betula", "alb", "linn"}},
{"2", "Plantago major var. major", []string{"plantago", "maior", "var.", "maior"}},
{"betula", "Betula", "betula", parsed.GenusType},
{"alba", "alba", "alb", parsed.SpEpithetType},
{"Linn", "Linn.", "linn", parsed.AuthorWordType},
}

cfg := gnparser.NewConfig(gnparser.OptWithDetails(true))
gnp := gnparser.New(cfg)
for _, v := range tests {
p := gnp.ParseName(v.name)
res := make([]string, len(p.Words))
for i, v := range p.Words {
res[i] = v.NormalizeMore()
}
res := parsed.NormalizeByType(v.word, v.wType)
assert.Equal(t, res, v.norm, v.msg)
}
}
Expand Down

0 comments on commit 6f60bd5

Please sign in to comment.