From 2c3e4508f194228a33c6a4870c492cd870c441b8 Mon Sep 17 00:00:00 2001 From: thc Date: Tue, 14 Nov 2023 10:55:43 +0100 Subject: [PATCH] better doc and comments --- README.md | 88 +++++++++++++++++++++++++++++++++---------------------- main.go | 24 ++++++--------- 2 files changed, 62 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 325cab6..83d3999 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,9 @@ Bayesian filter for mail subjects Ham/Spam discrimination using Spammer uses a lot of differents subjects, sometime with wrong spelling and garbage. -Purpose of this project is a classifier able to learn/identify spam/Ham mail subjects. +Purpose of this project is a basic classifier able to learn/identify spam/Ham from mail subjects. + +subayes reads lines from stdin and outputs them on stdout with the prefix "Spam: " or "Ham: " ## Basics @@ -17,8 +19,25 @@ Purpose of this project is a classifier able to learn/identify spam/Ham mail sub ## Building $ go mod tidy && go build -## Learning -$ rm db/Spam db/Ham +## Default options : +$ subayes -h +Usage of subayes: + -E explain words scores + -d string + data filename (default "subayes.spam") + -db string + db path (default "db") + -learnHam + learn Ham subjects + -learnSpam + learn Spam subjects + -m int + word min length (default 4) + -v verbose + + +## Learning +$ rm db/Spam db/Ham && mkdir db $ ./subayes -learnHam -d testdata/Ham -v INFO classifier corpus : [ Ham -> 0 items ] INFO classifier corpus : [ Ham -> 4623 items ] @@ -26,6 +45,14 @@ $ ./subayes -learnSpam -d testdata/esteban.txt -v INFO classifier corpus : [ Spam -> 0 items ] INFO classifier corpus : [ Spam -> 1096 items ] +## Testing +$ echo "mensaje al grupo de trabajo please" | subayes +Ham: mensaje al grupo de trabajo please + +$ echo "View sexy women in your neighborhood" | subayes +Spam: View sexy women in your neighborhood + + ## Evaluating words scores $ echo "mensaje al grupo de trabajo please" | subayes -E [ mensaje = Spam ] : [Ham]{ 
0.4000 } [Spam]{ 0.6000 } @@ -34,62 +61,53 @@ $ echo "mensaje al grupo de trabajo please" | subayes -E [ please = Ham ] : [Ham]{ 0.6667 } [Spam]{ 0.3333 } Ham: mensaje al grupo de trabajo please -## Spam detection from stdin -$ ./subayes < testdata/2023-05 | grep -c Spam -59213 -$ wc -l 2023-05 -241662 2023-05 ( meaning 24% Spam, WTF! ) +## Raw test from v0.1 +$ ./subayes.exe < testdata/2023-05 |cut -d: -f1|sort|uniq -c + 176347 Ham + 57102 Spam -## Relearning -$ ./subayes -learnHam -d testdata/Ham-rajout-1.txt -v -INFO classifier corpus : [ Ham -> 4623 items ] -INFO classifier corpus : [ Ham -> 4718 items ] -$ ./subayes < testdata/2023-05 | grep -c Spam -58240 +Meaning at least 24% Spam ! ``` -## Usage +## Common usage Use [utf8submimedecode](https://github.com/thc2cat/utf8submimedecode) -filter to decode utf8 encoded subjects lines. +filter to decode utf8 encoded subjects lines. -ex-pat contains lines to ignore patterns ( like Spam, or already detected users ). +ex-pat contains patterns for lines to ignore ( like Spam, [PUB] or already detected users ). -subjects.sed is a sed script extracting subjects from log line. +subjects.sed is a simple sed script extracting subjects from log lines. subayes will create two files in db/ : Spam and Ham -```shell +Each time you find a spammer, learn their subjects as spam, then verify the updated db against previous clean data to adjust false positives. 
-# Detection +```shell -logs/partage$ rg -z clamav sftp_logs/$LOGDATE/*clamav.log* | rg -vf ex-pat |\ - sed -f subjects.sed | utf8submimedecode | sort -u | subayes | rg Spam | \ - tee subayes.spam | mail -E -s "[subayes detection]" postmaster +# Detection from clamav logs -# Learning more Ham words : +logs/partage$ rg -z clamav sftp_logs/$LOGDATE/*clamav.log* \ +| rg -vf ex-pat | sed -f subjects.sed | utf8submimedecode \ +| sort -u | subayes | rg ^Spam \ +| tee subayes.spam | mail -E -s "[subayes detection]" postmaster -logs/partage$ rg -z clamav sftp_logs/$DATES/*clamav.log* | rg -vf ex-pat|\ - sed -f subjects.sed | utf8submimedecode | sort -u | subayes | rg Spam |\ - cut -c7- | tee subayes.spam +# If you want to know which words are tagged as Spam in a line, +# use the "-E" explain option (output is printed on stderr). - # edit ex-pat ( when you find new spammer address ) +$ subayes -E < subayes.spam +# Learning more Ham words : # edit subayes.spam (when you have false positives and relearn :) logs/partage$ subayes -v -learnHam -d subayes.spam - -# If you want to know what are the words tagged with Spam in a line, -# use "-E explain", save, edit and relearn. - -$ subayes -E < 2023.subjects 2>&1 | awk '/^\[/ { if ($4=="Spam") print $2 }' |\ - sort -u | tee subayes.words +( -d is optional, subayes.spam is the default data file) # Efficiency : -logs/partage$ subayes < /tmp/Hacked-account-Subjects | cut -d: -f1 | sort | uniq -c +logs/partage$ subayes < /tmp/Hacked-account-Subjects \ +| cut -d: -f1 | sort | uniq -c 5658 Ham 39016 Spam ( meaning 87% detection without false positives from filtered subjects) diff --git a/main.go b/main.go index 4d44c48..ba5d540 100644 --- a/main.go +++ b/main.go @@ -5,6 +5,9 @@ package main // v0.1 : working draft. // v0.2 : minlen words, better split func, default bayes class, +main_test.go // v0.3 : -E options for explaining and showing scores +// +// TODO : +// - how to remove an item from the db ? 
import ( "bufio" @@ -14,7 +17,7 @@ import ( "os" "regexp" - // Credits for "github.com/jbrukh/bayesian" + // Credits to "github.com/jbrukh/bayesian" "github.com/jbrukh/bayesian" ) @@ -25,8 +28,9 @@ var ( Spam bayesian.Class = "Spam" Ham bayesian.Class = "Ham" minlength = 4 - // words = regexp.MustCompile("[\\p{L}]+") + words = regexp.MustCompile(`[\p{L}]+`) + // See http://www.unicode.org/reports/tr44/#General_Category_Values ) func main() { @@ -136,6 +140,9 @@ func classify(c *bayesian.Classifier, pattern []string, d bayesian.Class) bayesi } // ProbScores return scores ([]float64), indexofclass, strict(?) _, likelyb, _ := c.ProbScores(pattern) + // Would testing strict should be done ? + // _, likelyb, strict := c.ProbScores(pattern) + // if false returning default class d ? return c.Classes[likelyb] } @@ -170,19 +177,6 @@ func split(s string) []string { return words.FindAllString(s, -1) } -// Generic version (string or int ) -// func removeDuplicate[T string | int](sliceList []T) []T { -// allKeys := make(map[T]bool) -// list := []T{} -// for _, item := range sliceList { -// if _, value := allKeys[item]; !value { -// allKeys[item] = true -// list = append(list, item) -// } -// } -// return list -// } - // removeduplicate func removeDuplicate(sliceList []string, length int) []string { allKeys := make(map[string]bool)