From 227b6acba398073cef336b0b834b5b11809457c2 Mon Sep 17 00:00:00 2001 From: thc Date: Mon, 29 Jan 2024 11:31:38 +0100 Subject: [PATCH] minor cleanings --- README.md | 8 ++++-- main_test.go | 80 +++++++++++++++++++++++++++++----------------------- 2 files changed, 50 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 505b639..c041713 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # Subayes -This is a naive bayesian classifier for mail subjects. Ham/Spam discrimination using +This is a naive bayesian classifier for mail subjects. + +Ham/Spam discrimination using [golang jbrukh/bayesian lib](https://github.com/jbrukh/bayesian). ![go.yml](https://github.com/thc2cat/subayes/actions/workflows/go.yml/badge.svg) @@ -10,10 +12,12 @@ This is a naive bayesian classifier for mail subjects. Ham/Spam discrimination u Spammer uses a lot of differents subjects, sometime with wrong spelling and garbage. -Purpose of this project is a basic classifier able to detect spam from mail subjects. +Purpose of this project is a basic classifier able to detect spam from mail subjects better than grep. subayes read stdin line and output them on stdout with prefix "Spam: " or "Ham: ". +**Training db is really important, unknown words will be classified with most learned class.** + ## Basics ```shell diff --git a/main_test.go b/main_test.go index a9cfea5..13326ef 100644 --- a/main_test.go +++ b/main_test.go @@ -8,29 +8,38 @@ import ( "github.com/jbrukh/bayesian" ) -func Test_split(t *testing.T) { +var ( + Spam bayesian.Class = "Spam" + Ham bayesian.Class = "Ham" - tests := []struct { - name string - s string - want []string + tests = []struct { + name string + s string + want []string + class bayesian.Class + wordscount int }{ - // {"empty", "", []string{""}}, - {"empty", "", nil}, - {"A test !", "A test !", []string{"A", "test"}}, - {"puctuation", "a lot of # numbers 1,2,5 ? ", []string{"a", "lot", "of", "numbers"}}, - {"real1", "Re: [3616304] MODIFICATION // Fwd: Senegal", []string{"Re", "MODIFICATION", "Fwd", "Senegal"}}, - {"real2", "Chèque MAIF 1 000€", []string{"Chèque", "MAIF"}}, + {"empty", "", nil, Ham, 0}, + {"A test !", "A test !", []string{"A", "test"}, Ham, 1}, + {"puctuation", "a lot of # numbers 1,2,5 ? ", []string{"a", "lot", "of", "numbers"}, Ham, 2}, + {"real1", "Re: [3616304] MODIFICATION // Fwd: Senegal", []string{"Re", "MODIFICATION", "Fwd", "Senegal"}, Ham, 3}, + {"real2", "Chèque MAIF 1 000€", []string{"Chèque", "MAIF"}, Ham, 2}, {"real3", "Re: Rattrapages (examens de seconde chance) 12-16 juin et 19-23 juin", - []string{"Re", "Rattrapages", "examens", "de", "seconde", "chance", "juin", "et", "juin"}}, - {"real4", "Vaše vlastné dievča lokálne", []string{"Vaše", "vlastné", "dievča", "lokálne"}}, + []string{"Re", "Rattrapages", "examens", "de", "seconde", "chance", "juin", "et", "juin"}, Ham, 5}, + {"real4", "Vaše vlastné dievča lokálne", []string{"Vaše", "vlastné", "dievča", "lokálne"}, Spam, 4}, {"real5", "Compañeras własne Zdobądź nära pobliżu", - []string{"Compañeras", "własne", "Zdobądź", "nära", "pobliżu"}}, + []string{"Compañeras", "własne", "Zdobądź", "nära", "pobliżu"}, Spam, 5}, {"apostrophes", "Parlez avec des meufs pour s’envoyer en l’air dès maintenant", - []string{"Parlez", "avec", "des", "meufs", "pour", "s", "envoyer", "en", "l", "air", "dès", "maintenant"}}, + []string{"Parlez", "avec", "des", "meufs", "pour", "s", "envoyer", "en", "l", "air", "dès", "maintenant"}, Spam, 9}, + {"with numbers", "ceci contient 1.999 test et ceci 10000 excuses", + []string{"ceci", "contient", "test", "et", "ceci", "excuses"}, Ham, 4}, } - // TODO: Add test cases. + K = bayesian.NewClassifier(Ham, Spam) +) + +func Test_split(t *testing.T) { + verbose = true for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if got := split(tt.s); !reflect.DeepEqual(got, tt.want) { @@ -41,39 +50,38 @@ func Test_split(t *testing.T) { } func Test_most(t *testing.T) { - var ( - Spam bayesian.Class = "Spam" - Ham bayesian.Class = "Ham" - ) verbose = true - K := bayesian.NewClassifier(Ham, Spam) - if _, err := os.Stat("db"); !os.IsNotExist(err) { // path/to/whatever does not exist errcheck(K.ReadClassFromFile(Spam, "db")) errcheck(K.ReadClassFromFile(Ham, "db")) showClassesCount(K) } - } func Test_removeDuplicate(t *testing.T) { - tests := []struct { - name string - array []string - want []string - length int - }{ - // WARNING : lowercase since V1.2 - {"basic", []string{"AAAA", "AAAA", "BBB"}, []string{"aaaa", "bbb"}, 3}, - {"length3", []string{"A", "BBB", "CCC", "BBB"}, []string{"bbb", "ccc"}, 3}, - {"length2", []string{"AA", "BBB", "CCC", "BBB"}, []string{"aa", "bbb", "ccc"}, 2}, - {"with numbers", []string{"ceci", "contient", "1.999", "test", "10000", "excuses"}, []string{"ceci", "contient", "test", "excuses"}, 4}, + verbose = true + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := removeDuplicate(split(tt.s), 3); len(got) != tt.wordscount { + t.Errorf("removeDuplicate() '%s' is '%s' gave '%v', we want '%v'", tt.name, tt.s, len(got), tt.wordscount) + } + }) + } +} + +func Test_classify(t *testing.T) { + verbose = true + if _, err := os.Stat("db"); !os.IsNotExist(err) { + // path/to/whatever does not exist + errcheck(K.ReadClassFromFile(Spam, "db")) + errcheck(K.ReadClassFromFile(Ham, "db")) } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := removeDuplicate(tt.array, tt.length); !reflect.DeepEqual(got, tt.want) { - t.Errorf("removeDuplicate() gave %v, we want %v", got, tt.want) + if got := classify(K, tt.want, Ham); got != tt.class { + t.Errorf("For %v test : %v got %v but want %v\n", + tt.name, classify(K, tt.want, Ham), got, tt.want) } }) }