Skip to content

Commit

Permalink
normalize/parse names with non-breaking hyphens (close #237)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Nov 10, 2022
1 parent 79c2dc1 commit 057a468
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 92 deletions.
19 changes: 19 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## Unreleased

## [v1.6.9] - 2022-11-10 Thu

- Add [#237]: detect and normalize non-breaking hyphens. If other
non-typical hyphens appear, they will be handled the same way.

## [v1.6.8] - 2022-10-01 Sat

- Add: update all modules.
Expand Down Expand Up @@ -402,6 +407,10 @@

This document follows [changelog guidelines]

[v1.6.9]: https://github.com/gnames/gnparser/compare/v1.6.8...v1.6.9
[v1.6.8]: https://github.com/gnames/gnparser/compare/v1.6.7...v1.6.8
[v1.6.7]: https://github.com/gnames/gnparser/compare/v1.6.6...v1.6.7
[v1.6.6]: https://github.com/gnames/gnparser/compare/v1.6.5...v1.6.6
[v1.6.5]: https://github.com/gnames/gnparser/compare/v1.6.4...v1.6.5
[v1.6.4]: https://github.com/gnames/gnparser/compare/v1.6.3...v1.6.4
[v1.6.3]: https://github.com/gnames/gnparser/compare/v1.6.2...v1.6.3
Expand Down Expand Up @@ -460,6 +469,16 @@ This document follows [changelog guidelines]
[v0.7.0]: https://github.com/gnames/gnparser/compare/v0.6.0...v0.7.0
[v0.6.0]: https://github.com/gnames/gnparser/compare/v0.5.1...v0.6.0
[v0.5.1]: https://github.com/gnames/gnparser/tree/v0.5.1
[#240]: https://github.com/gnames/gnparser/issues/240
[#239]: https://github.com/gnames/gnparser/issues/239
[#238]: https://github.com/gnames/gnparser/issues/238
[#237]: https://github.com/gnames/gnparser/issues/237
[#236]: https://github.com/gnames/gnparser/issues/236
[#235]: https://github.com/gnames/gnparser/issues/235
[#234]: https://github.com/gnames/gnparser/issues/234
[#233]: https://github.com/gnames/gnparser/issues/233
[#232]: https://github.com/gnames/gnparser/issues/232
[#231]: https://github.com/gnames/gnparser/issues/231
[#230]: https://github.com/gnames/gnparser/issues/230
[#229]: https://github.com/gnames/gnparser/issues/229
[#228]: https://github.com/gnames/gnparser/issues/228
Expand Down
3 changes: 3 additions & 0 deletions ent/parsed/warning.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const (
CharBadWarn
ContainsIgnoredAnnotation
CultivarEpithetWarn
DashOtherWarn
DotEpithetWarn
GenusAbbrWarn
GenusUpperCharAfterDash
Expand Down Expand Up @@ -90,6 +91,7 @@ var warningMap = map[Warning]string{
CharBadWarn: "Non-standard characters in canonical",
ContainsIgnoredAnnotation: "Ignored annotation `mihi`",
CultivarEpithetWarn: "Cultivar epithet",
DashOtherWarn: "Atypical hyphen character",
DotEpithetWarn: "Period character is not allowed in canonical",
GenusAbbrWarn: "Abbreviated uninomial word",
GenusUpperCharAfterDash: "Apparent genus with capital character after hyphen",
Expand Down Expand Up @@ -158,6 +160,7 @@ var WarningQualityMap = map[Warning]int{
CharBadWarn: 2,
ContainsIgnoredAnnotation: 3,
CultivarEpithetWarn: 2,
DashOtherWarn: 2,
DotEpithetWarn: 3,
GenusAbbrWarn: 4,
GenusUpperCharAfterDash: 2,
Expand Down
7 changes: 7 additions & 0 deletions ent/parser/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -1167,6 +1167,9 @@ func (p *Engine) newWordNode(n *node32, wt parsed.WordType) *parsed.Word {
nv := str.ToASCII(wrd.Verbatim, str.GlobalTransliterations)
wrd.Normalized = nv
}
case ruleDashOther:
p.addWarn(parsed.DashOtherWarn)
wrd.Normalized = normalizeDashes(wrd.Verbatim)
}
}

Expand Down Expand Up @@ -1220,6 +1223,10 @@ func (p *Engine) newCultivarEpithetNode(n *node32, wt parsed.WordType) *cultivar

var numWord = regexp.MustCompile(`^([0-9]+)[-\.]?(.+)$`)

// normalizeDashes returns s with every occurrence of the atypical
// (non-breaking) hyphen character replaced by the plain ASCII hyphen '-'.
// Strings that do not contain the atypical hyphen are returned unchanged.
func normalizeDashes(s string) string {
	const (
		atypicalDash = "‑"
		asciiDash    = "-"
	)
	if !strings.Contains(s, atypicalDash) {
		// Nothing to normalize.
		return s
	}
	return strings.Replace(s, atypicalDash, asciiDash, -1)
}

func normalizeNums(s string) string {
res := s
match := numWord.FindAllStringSubmatch(s, 1)
Expand Down
1 change: 1 addition & 0 deletions ent/parser/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ var nodeRules = map[pegRule]struct{}{
ruleComparison: {},
ruleCultivar: {},
ruleCultivarRecursive: {},
ruleDashOther: {},
ruleDotPrefix: {},
ruleFilius: {},
ruleFiliusFNoSpace: {},
Expand Down
4 changes: 3 additions & 1 deletion ent/parser/grammar.peg
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,9 @@ ApostrASCII <- '\''
ApostrOther <- '‘' / '’' / '`' / '´'


Dash <- '-'
Dash <- '-' / DashOther

DashOther <- [‑]

Slash <- '/'

Expand Down
Loading

0 comments on commit 057a468

Please sign in to comment.