Skip to content

Commit

Permalink
minor cleanings
Browse files Browse the repository at this point in the history
  • Loading branch information
thc committed Jan 29, 2024
1 parent e68bcf6 commit 227b6ac
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 38 deletions.
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Subayes

This is a naive bayesian classifier for mail subjects. Ham/Spam discrimination using
This is a naive bayesian classifier for mail subjects.

Ham/Spam discrimination using
[golang jbrukh/bayesian lib](https://github.com/jbrukh/bayesian).

![go.yml](https://github.com/thc2cat/subayes/actions/workflows/go.yml/badge.svg)
Expand All @@ -10,10 +12,12 @@ This is a naive bayesian classifier for mail subjects. Ham/Spam discrimination u

Spammers use a lot of different subjects, sometimes with wrong spelling and garbage.

The purpose of this project is to provide a basic classifier able to detect spam from mail subjects.
The purpose of this project is to provide a basic classifier able to detect spam from mail subjects better than grep.

subayes reads lines from stdin and writes them to stdout, prefixed with "Spam: " or "Ham: ".

**The training db is really important: unknown words will be classified as the most-learned class.**

## Basics

```shell
Expand Down
80 changes: 44 additions & 36 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,38 @@ import (
"github.com/jbrukh/bayesian"
)

func Test_split(t *testing.T) {
var (
Spam bayesian.Class = "Spam"
Ham bayesian.Class = "Ham"

tests := []struct {
name string
s string
want []string
tests = []struct {
name string
s string
want []string
class bayesian.Class
wordscount int
}{
// {"empty", "", []string{""}},
{"empty", "", nil},
{"A test !", "A test !", []string{"A", "test"}},
{"puctuation", "a lot of # numbers 1,2,5 ? ", []string{"a", "lot", "of", "numbers"}},
{"real1", "Re: [3616304] MODIFICATION // Fwd: Senegal", []string{"Re", "MODIFICATION", "Fwd", "Senegal"}},
{"real2", "Chèque MAIF 1 000€", []string{"Chèque", "MAIF"}},
{"empty", "", nil, Ham, 0},
{"A test !", "A test !", []string{"A", "test"}, Ham, 1},
{"puctuation", "a lot of # numbers 1,2,5 ? ", []string{"a", "lot", "of", "numbers"}, Ham, 2},
{"real1", "Re: [3616304] MODIFICATION // Fwd: Senegal", []string{"Re", "MODIFICATION", "Fwd", "Senegal"}, Ham, 3},
{"real2", "Chèque MAIF 1 000€", []string{"Chèque", "MAIF"}, Ham, 2},
{"real3", "Re: Rattrapages (examens de seconde chance) 12-16 juin et 19-23 juin",
[]string{"Re", "Rattrapages", "examens", "de", "seconde", "chance", "juin", "et", "juin"}},
{"real4", "Vaše vlastné dievča lokálne", []string{"Vaše", "vlastné", "dievča", "lokálne"}},
[]string{"Re", "Rattrapages", "examens", "de", "seconde", "chance", "juin", "et", "juin"}, Ham, 5},
{"real4", "Vaše vlastné dievča lokálne", []string{"Vaše", "vlastné", "dievča", "lokálne"}, Spam, 4},
{"real5", "Compañeras własne Zdobądź nära pobliżu",
[]string{"Compañeras", "własne", "Zdobądź", "nära", "pobliżu"}},
[]string{"Compañeras", "własne", "Zdobądź", "nära", "pobliżu"}, Spam, 5},
{"apostrophes", "Parlez avec des meufs pour s’envoyer en l’air dès maintenant",
[]string{"Parlez", "avec", "des", "meufs", "pour", "s", "envoyer", "en", "l", "air", "dès", "maintenant"}},
[]string{"Parlez", "avec", "des", "meufs", "pour", "s", "envoyer", "en", "l", "air", "dès", "maintenant"}, Spam, 9},
{"with numbers", "ceci contient 1.999 test et ceci 10000 excuses",
[]string{"ceci", "contient", "test", "et", "ceci", "excuses"}, Ham, 4},
}
// TODO: Add test cases.

K = bayesian.NewClassifier(Ham, Spam)
)

func Test_split(t *testing.T) {
verbose = true
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := split(tt.s); !reflect.DeepEqual(got, tt.want) {
Expand All @@ -41,39 +50,38 @@ func Test_split(t *testing.T) {
}

// Test_most builds a Ham/Spam classifier and, when a "db" path is present,
// loads both classes from it and prints the per-class counts.
// It makes no assertions of its own; error handling is delegated to errcheck,
// which is defined outside this view.
func Test_most(t *testing.T) {
// Class labels used to build and populate the classifier.
var (
Spam bayesian.Class = "Spam"
Ham bayesian.Class = "Ham"
)
// verbose is declared elsewhere in the package; presumably it enables extra
// output from the helpers called below — TODO confirm.
verbose = true
K := bayesian.NewClassifier(Ham, Spam)

// Load stored class data only when os.Stat does not report "db" as missing.
// NOTE(review): the condition is also true when Stat fails for a reason other
// than "not exist" (e.g. permissions) — confirm that is intended.
if _, err := os.Stat("db"); !os.IsNotExist(err) {
// Reached when "db" was not reported as missing.
errcheck(K.ReadClassFromFile(Spam, "db"))
errcheck(K.ReadClassFromFile(Ham, "db"))
showClassesCount(K)
}

}

func Test_removeDuplicate(t *testing.T) {
tests := []struct {
name string
array []string
want []string
length int
}{
// WARNING : lowercase since V1.2
{"basic", []string{"AAAA", "AAAA", "BBB"}, []string{"aaaa", "bbb"}, 3},
{"length3", []string{"A", "BBB", "CCC", "BBB"}, []string{"bbb", "ccc"}, 3},
{"length2", []string{"AA", "BBB", "CCC", "BBB"}, []string{"aa", "bbb", "ccc"}, 2},
{"with numbers", []string{"ceci", "contient", "1.999", "test", "10000", "excuses"}, []string{"ceci", "contient", "test", "excuses"}, 4},
verbose = true
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := removeDuplicate(split(tt.s), 3); len(got) != tt.wordscount {
t.Errorf("removeDuplicate() '%s' is '%s' gave '%v', we want '%v'", tt.name, tt.s, len(got), tt.wordscount)
}
})
}
}

func Test_classify(t *testing.T) {
verbose = true
if _, err := os.Stat("db"); !os.IsNotExist(err) {
// path/to/whatever does not exist
errcheck(K.ReadClassFromFile(Spam, "db"))
errcheck(K.ReadClassFromFile(Ham, "db"))
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := removeDuplicate(tt.array, tt.length); !reflect.DeepEqual(got, tt.want) {
t.Errorf("removeDuplicate() gave %v, we want %v", got, tt.want)
if got := classify(K, tt.want, Ham); got != tt.class {
t.Errorf("For %v test : %v got %v but want %v\n",
tt.name, classify(K, tt.want, Ham), got, tt.want)
}
})
}
Expand Down

0 comments on commit 227b6ac

Please sign in to comment.