Skip to content

Commit

Permalink
Merge pull request #12 from f1monkey/feature/caverphone2
Browse files Browse the repository at this point in the history
Feature/caverphone2
  • Loading branch information
cyradin authored Jan 21, 2023
2 parents d9c5998 + eb7ca3c commit 9fec3b4
Show file tree
Hide file tree
Showing 4 changed files with 349 additions and 1 deletion.
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

Set of different phonetic encoders' implementations.

Now consists of:
Provided encoders:
* [Beider-Morse encoder](#beider-morse-encoder) - BMPM implementation. It's a Go port of [the original PHP library](https://stevemorse.org/phoneticinfo.htm)
* [Caverphone2](#caverphone2-encoder) - implementation of [Caverphone 2.0 algorithm](https://en.wikipedia.org/wiki/Caverphone#Caverphone_2.0)


## Installation
Expand Down Expand Up @@ -91,3 +92,23 @@ func main() {
// prints: [uranzi uranz uranS uranzi uranz uranhi uranh]
}
```

### Caverphone2 encoder
Code example:

```go
package main

import (
"fmt"

"github.com/f1monkey/phonetic/caverphone2"
)

func main() {
// The encoder holds no configuration; one instance can be reused for many words.
e := caverphone2.NewEncoder()
// Encode always returns a 10-character code, padded on the right with '1'.
result := e.Encode("orange")
fmt.Println(result)
// prints: ARNK111111
}
```
33 changes: 33 additions & 0 deletions caverphone2/encoder.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package caverphone2

import (
"bytes"
"strings"
)

// suffix is the run of '1' characters appended to every code before it is cut
// to its final size. Its length (10) is also the fixed length of the value
// returned by Encode, and it is returned as-is for an empty input.
var suffix = []byte("1111111111")

// Encoder produces Caverphone 2.0 codes. It carries no fields, so a single
// value holds no per-call state.
type Encoder struct{}

// NewEncoder returns a pointer to a new Encoder.
func NewEncoder() *Encoder {
	return new(Encoder)
}

// Encode returns the Caverphone 2.0 code for input.
//
// The input is lower-cased and then rewritten by every entry of rules, in
// table order: entries that carry a compiled regexp are applied as regexp
// replacements, all others as literal byte replacements. The rewritten text
// is padded with suffix and cut to len(suffix) bytes, so the result always
// has the same length. An empty input yields the padding alone.
func (e *Encoder) Encode(input string) string {
	if input == "" {
		return string(suffix)
	}

	code := []byte(strings.ToLower(input))
	for i := range rules {
		r := &rules[i]
		if r.regexp == nil {
			code = bytes.ReplaceAll(code, r.pattern, r.replaceWith)
			continue
		}
		code = r.regexp.ReplaceAll(code, r.replaceWith)
	}

	// Pad first, then truncate: this guarantees at least len(suffix) bytes
	// are available no matter how short the rewritten text is.
	code = append(code, suffix...)
	return string(code[:len(suffix)])
}
54 changes: 54 additions & 0 deletions caverphone2/encoder_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package caverphone2

import (
"testing"

"github.com/stretchr/testify/require"
)

// Benchmark_Encoder_Encode measures repeated encoding of one short word with
// a single Encoder created outside the timed loop.
func Benchmark_Encoder_Encode(b *testing.B) {
	enc := NewEncoder()
	for n := 0; n < b.N; n++ {
		enc.Encode("orange")
	}
}

// Test_Encoder_Encode checks Encode against a table of words with known
// codes, running each word as its own subtest named after the input.
func Test_Encoder_Encode(t *testing.T) {
	type testCase struct {
		input    string
		expected string
	}

	cases := []testCase{
		{input: "add", expected: "AT11111111"},
		{input: "eat", expected: "AT11111111"},
		{input: "hold", expected: "AT11111111"},
		{input: "orange", expected: "ARNK111111"},
		{input: "test", expected: "TST1111111"},
		{input: "ready", expected: "RTA1111111"},
	}

	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			got := NewEncoder().Encode(tc.input)
			require.Equal(t, tc.expected, got)
		})
	}
}
240 changes: 240 additions & 0 deletions caverphone2/rules.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
package caverphone2

import "regexp"

// rule is one rewrite step of the encoding pipeline. A rule matches either a
// compiled regular expression or a literal byte sequence; every match is
// replaced with replaceWith.
type rule struct {
	// regexp, when non-nil, selects regexp matching and pattern is unused.
	regexp *regexp.Regexp
	// pattern is the literal byte sequence to match when regexp is nil.
	pattern []byte

	// replaceWith is substituted for every match; it may be empty to delete
	// the matched text.
	replaceWith []byte
}

// rules is the ordered list of rewrites that turns a lower-cased word into
// its code. Order matters: later rules operate on the output of earlier ones.
//
// Lower-case letters are still-unprocessed input. The digit '3' stands in
// for a vowel position and '2' for a dropped letter; both digits are removed
// by the last rules. Upper-case letters are not matched by any later
// lower-case rule, so they survive into the result.
var rules = []rule{
	// Keep only a-z, then drop a trailing "e".
	{regexp: regexp.MustCompile("[^a-z]"), replaceWith: []byte("")},
	{regexp: regexp.MustCompile("e$"), replaceWith: []byte("")},

	// Special-cased word beginnings.
	{regexp: regexp.MustCompile("^cough"), replaceWith: []byte("cou2f")},
	{regexp: regexp.MustCompile("^rough"), replaceWith: []byte("rou2f")},
	{regexp: regexp.MustCompile("^tough"), replaceWith: []byte("tou2f")},
	{regexp: regexp.MustCompile("^enough"), replaceWith: []byte("enou2f")},
	{regexp: regexp.MustCompile("^trough"), replaceWith: []byte("trou2f")},
	{regexp: regexp.MustCompile("^gn"), replaceWith: []byte("2n")},

	// Special-cased word ending.
	{regexp: regexp.MustCompile("mb$"), replaceWith: []byte("m2")},

	// Consonant normalisation.
	{pattern: []byte("cq"), replaceWith: []byte("2q")},
	{pattern: []byte("ci"), replaceWith: []byte("si")},
	{pattern: []byte("ce"), replaceWith: []byte("se")},
	{pattern: []byte("cy"), replaceWith: []byte("sy")},
	{pattern: []byte("tch"), replaceWith: []byte("2ch")},
	{pattern: []byte("c"), replaceWith: []byte("k")},
	{pattern: []byte("q"), replaceWith: []byte("k")},
	{pattern: []byte("x"), replaceWith: []byte("k")},
	{pattern: []byte("v"), replaceWith: []byte("f")},
	{pattern: []byte("dg"), replaceWith: []byte("2g")},
	{pattern: []byte("tio"), replaceWith: []byte("sio")},
	{pattern: []byte("tia"), replaceWith: []byte("sia")},
	{pattern: []byte("d"), replaceWith: []byte("t")},
	{pattern: []byte("ph"), replaceWith: []byte("fh")},
	{pattern: []byte("b"), replaceWith: []byte("p")},
	{pattern: []byte("sh"), replaceWith: []byte("s2")},
	{pattern: []byte("z"), replaceWith: []byte("s")},

	// Vowels: a leading vowel becomes "A", every other vowel becomes "3".
	{regexp: regexp.MustCompile("^[aeiou]"), replaceWith: []byte("A")},
	{regexp: regexp.MustCompile("[aeiou]"), replaceWith: []byte("3")},

	// "j" and "y".
	{pattern: []byte("j"), replaceWith: []byte("y")},
	{regexp: regexp.MustCompile("^y3"), replaceWith: []byte("Y3")},
	{regexp: regexp.MustCompile("^y"), replaceWith: []byte("A")},
	{pattern: []byte("y"), replaceWith: []byte("3")},

	// "g" and "gh".
	{pattern: []byte("3gh3"), replaceWith: []byte("3kh3")},
	{pattern: []byte("gh"), replaceWith: []byte("22")},
	{pattern: []byte("g"), replaceWith: []byte("k")},

	// Collapse runs of the same consonant into one upper-case letter.
	{regexp: regexp.MustCompile("s+"), replaceWith: []byte("S")},
	{regexp: regexp.MustCompile("t+"), replaceWith: []byte("T")},
	{regexp: regexp.MustCompile("p+"), replaceWith: []byte("P")},
	{regexp: regexp.MustCompile("k+"), replaceWith: []byte("K")},
	{regexp: regexp.MustCompile("f+"), replaceWith: []byte("F")},
	{regexp: regexp.MustCompile("m+"), replaceWith: []byte("M")},
	{regexp: regexp.MustCompile("n+"), replaceWith: []byte("N")},

	// "w".
	{pattern: []byte("w3"), replaceWith: []byte("W3")},
	{pattern: []byte("wh3"), replaceWith: []byte("Wh3")},
	{regexp: regexp.MustCompile("w$"), replaceWith: []byte("3")},
	{pattern: []byte("w"), replaceWith: []byte("2")},

	// "h".
	{regexp: regexp.MustCompile("^h"), replaceWith: []byte("A")},
	{pattern: []byte("h"), replaceWith: []byte("2")},

	// "r".
	{pattern: []byte("r3"), replaceWith: []byte("R3")},
	{regexp: regexp.MustCompile("r$"), replaceWith: []byte("3")},
	{pattern: []byte("r"), replaceWith: []byte("2")},

	// "l".
	{pattern: []byte("l3"), replaceWith: []byte("L3")},
	{regexp: regexp.MustCompile("l$"), replaceWith: []byte("3")},
	{pattern: []byte("l"), replaceWith: []byte("2")},

	// Strip the placeholders; a trailing "3" is kept as "A".
	{pattern: []byte("2"), replaceWith: []byte("")},
	{regexp: regexp.MustCompile("3$"), replaceWith: []byte("A")},
	{pattern: []byte("3"), replaceWith: []byte("")},
}

0 comments on commit 9fec3b4

Please sign in to comment.