Skip to content

Commit

Permalink
better version of additional normalize (#202)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Nov 14, 2021
1 parent dc67aaf commit 6f60bd5
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 42 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

## Unreleased

## [v1.5.4]

- Add: a different approach to normalize-by-type for words.
- Add [#204]: allow genera starting with De-, Eu-, Le-, Ne- (by @tobymarsden).
- Add [#203]: allow up to 2 dashes in genera (by @tobymarsden).

## [v1.5.3]

- Add [#202]: add NormalizeMore function for Word.
Expand Down Expand Up @@ -331,6 +337,7 @@ array of names instead of a stream.

This document follows [changelog guidelines]

[v1.5.4]: https://github.com/gnames/gnparser/compare/v1.5.3...v1.5.4
[v1.5.3]: https://github.com/gnames/gnparser/compare/v1.5.2...v1.5.3
[v1.5.2]: https://github.com/gnames/gnparser/compare/v1.5.1...v1.5.2
[v1.5.1]: https://github.com/gnames/gnparser/compare/v1.5.0...v1.5.1
Expand Down
18 changes: 10 additions & 8 deletions ent/parsed/words.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"strings"

"github.com/gnames/gnparser/ent/stemmer"
"github.com/gnames/gnparser/ent/str"
)

// Word represents a parsed word and its meaning in the name-string.
Expand All @@ -21,21 +22,22 @@ type Word struct {
End int `json:"end"`
}

// NormalizeMore is useful when searching for a word alone.
// NormalizeByType is useful when searching for a word alone.
// In such cases specific epithets will match better when stemmed,
// authors low-cased with stripped periods.
func (w Word) NormalizeMore() string {
// authors and genera low-cased, authors with stripped periods.
func NormalizeByType(wrd string, wt WordType) string {
var res string
switch w.Type {
wrd = str.Normalize(wrd)
switch wt {
case SpEpithetType, InfraspEpithetType:
res = stemmer.Stem(w.Normalized).Stem
res = stemmer.Stem(wrd).Stem
case GenusType:
res = strings.ToLower(w.Normalized)
res = strings.ToLower(wrd)
case AuthorWordType:
res = strings.ToLower(w.Normalized)
res = strings.ToLower(wrd)
res = strings.ReplaceAll(res, ".", "")
default:
res = w.Normalized
res = wrd
}
return res
}
Expand Down
23 changes: 5 additions & 18 deletions ent/parser/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -1146,22 +1146,22 @@ func (p *Engine) newWordNode(n *node32, wt parsed.WordType) *parsed.Word {
switch v.pegRule {
case ruleDotPrefix:
p.addWarn(parsed.DotEpithetWarn)
wrd.Normalized, _ = normalize(wrd.Verbatim)
wrd.Normalized = str.Normalize(wrd.Verbatim)
case ruleUpperCharExtended, ruleLowerCharExtended:
p.addWarn(parsed.CharBadWarn)
wrd.Normalized, _ = normalize(wrd.Verbatim)
wrd.Normalized = str.Normalize(wrd.Verbatim)
case ruleWordApostr:
p.addWarn(parsed.CanonicalApostropheWarn)
canonicalApostrophe = true
wrd.Normalized, _ = normalize(wrd.Verbatim)
wrd.Normalized = str.Normalize(wrd.Verbatim)
case ruleWordStartsWithDigit:
p.addWarn(parsed.SpeciesNumericWarn)
wrd.Normalized = normalizeNums(wrd.Verbatim)
case ruleApostrOther:
p.addWarn(parsed.ApostrOtherWarn)
if !canonicalApostrophe {
nv, _ := str.ToASCII([]byte(wrd.Verbatim), str.GlobalTransliterations)
wrd.Normalized = string(nv)
nv := str.ToASCII(wrd.Verbatim, str.GlobalTransliterations)
wrd.Normalized = nv
}
}
}
Expand Down Expand Up @@ -1214,19 +1214,6 @@ func (p *Engine) newCultivarEpithetNode(n *node32, wt parsed.WordType) *cultivar
return &cv
}

func normalize(s string) (string, error) {
res := s
if s == "" {
return s, nil
}
nv, err := str.ToASCII([]byte(s), str.Transliterations)
if err != nil {
return res, err
}
res = string(nv)
return res, nil
}

var numWord = regexp.MustCompile(`^([0-9]+)[-\.]?(.+)$`)

func normalizeNums(s string) string {
Expand Down
14 changes: 12 additions & 2 deletions ent/str/str.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,18 @@ func CapitalizeName(name string) string {
return string(runes)
}

// Normalize takes a string and returns a normalized version of it.
// The Normalize function should be idempotent.
func Normalize(s string) string {
return ToASCII(s, Transliterations)
}

// ToASCII converts a UTF-8 diacritics to corresponding ASCII chars.
func ToASCII(b []byte, m map[rune]string) ([]byte, error) {
func ToASCII(s string, m map[rune]string) string {
if s == "" {
return s
}
b := []byte(s)
tlBuf := bytes.NewBuffer(make([]byte, 0, len(b)*125/100))
for i, w := 0, 0; i < len(b); i += w {
r, width := utf8.DecodeRune(b[i:])
Expand All @@ -43,7 +53,7 @@ func ToASCII(b []byte, m map[rune]string) ([]byte, error) {
}
w = width
}
return tlBuf.Bytes(), nil
return tlBuf.String()
}

func IsBoldSurrogate(s string) bool {
Expand Down
4 changes: 2 additions & 2 deletions ent/str/str_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ func TestStringTools(t *testing.T) {
{"‘", "‘", "", str.Transliterations},
}
for _, v := range tests {
res, _ := str.ToASCII([]byte(v.in), v.tbl)
assert.Equal(t, string(res), v.out, v.msg)
res := str.ToASCII(v.in, v.tbl)
assert.Equal(t, res, v.out, v.msg)
}
})

Expand Down
19 changes: 7 additions & 12 deletions gnparser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,23 +74,18 @@ func TestParseLowCaseName(t *testing.T) {
}
}

func TestWordNormalizeMore(t *testing.T) {
func TestWordNormalizeByType(t *testing.T) {
tests := []struct {
msg, name string
norm []string
msg, word, norm string
wType parsed.WordType
}{
{"1", "Betula alba Linn.", []string{"betula", "alb", "linn"}},
{"2", "Plantago major var. major", []string{"plantago", "maior", "var.", "maior"}},
{"betula", "Betula", "betula", parsed.GenusType},
{"alba", "alba", "alb", parsed.SpEpithetType},
{"Linn", "Linn.", "linn", parsed.AuthorWordType},
}

cfg := gnparser.NewConfig(gnparser.OptWithDetails(true))
gnp := gnparser.New(cfg)
for _, v := range tests {
p := gnp.ParseName(v.name)
res := make([]string, len(p.Words))
for i, v := range p.Words {
res[i] = v.NormalizeMore()
}
res := parsed.NormalizeByType(v.word, v.wType)
assert.Equal(t, res, v.norm, v.msg)
}
}
Expand Down

0 comments on commit 6f60bd5

Please sign in to comment.