better doc and comments

thc2cat · Nov 14, 2023 · 2c3e450 · 2c3e450
1 parent 43941a7
commit 2c3e450
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 50 deletions.
diff --git a/README.md b/README.md
@@ -9,23 +9,50 @@ Bayesian filter for mail subjects Ham/Spam discrimination using
 
 Spammers use a lot of different subjects, sometimes with wrong spelling and garbage.
 
-Purpose of this project is a classifier able to learn/identify spam/Ham mail subjects.
+The purpose of this project is a basic classifier able to learn/identify Spam/Ham from mail subjects.
+
+subayes reads lines from stdin and outputs them on stdout with the prefix "Spam: " or "Ham: ".
 
 ## Basics
 
 ```shell
 ## Building
 $ go mod tidy && go build 
 
-## Learning
-$ rm db/Spam db/Ham
+## Default options:
+$ subayes -h
+Usage of subayes:
+  -E    explain words scores
+  -d string
+        data filename (default "subayes.spam")
+  -db string
+         db path (default "db")
+  -learnHam
+        learn Ham subjects
+  -learnSpam
+        learn Spam subjects
+  -m int
+        word min length (default 4)
+  -v    verbose
+
+
+## Learning 
+$ rm db/Spam db/Ham && mkdir db
 $ ./subayes  -learnHam -d testdata/Ham -v
 INFO classifier corpus :  [ Ham -> 0 items ]
 INFO classifier corpus :  [ Ham -> 4623 items ]
 $ ./subayes  -learnSpam -d testdata/esteban.txt -v
 INFO classifier corpus :  [ Spam -> 0 items ]
 INFO classifier corpus :  [ Spam -> 1096 items ]
 
+## Testing 
+$ echo "mensaje al grupo de trabajo please" | subayes
+Ham: mensaje al grupo de trabajo please
+
+$ echo "View sexy women in your neighborhood" | subayes
+Spam: View sexy women in your neighborhood
+
+
 ## Evaluating words scores
 $ echo "mensaje al grupo de trabajo please" | subayes -E    
 [ mensaje = Spam ] : [Ham]{ 0.4000 } [Spam]{ 0.6000 } 
@@ -34,62 +61,53 @@ $ echo "mensaje al grupo de trabajo please" | subayes -E
 [ please = Ham ] : [Ham]{ 0.6667 } [Spam]{ 0.3333 } 
 Ham: mensaje al grupo de trabajo please
 
-## Spam detection from stdin
-$ ./subayes < testdata/2023-05  | grep -c Spam
-59213
-$ wc -l 2023-05
-241662 2023-05 ( meaning 24% Spam, WTF! )
+## Raw test from v0.1
+$ ./subayes.exe < testdata/2023-05 |cut -d: -f1|sort|uniq -c
+ 176347 Ham
+  57102 Spam
 
-## Relearning
-$ ./subayes -learnHam -d testdata/Ham-rajout-1.txt -v
-INFO classifier corpus :  [ Ham -> 4623 items ]
-INFO classifier corpus :  [ Ham -> 4718 items ]
-$ ./subayes < testdata/2023-05  | grep -c Spam
-58240
+Meaning at least 24% Spam ! 
 
 ```
 
-## Usage
+## Common usage
 
 Use
 [utf8submimedecode](https://github.com/thc2cat/utf8submimedecode)
-filter to decode  utf8 encoded subjects lines.
+filter to decode utf8 encoded subject lines.
 
-ex-pat contains lines to ignore patterns ( like Spam, or already detected users ).
+ex-pat contains patterns for lines to ignore (like Spam, [PUB] or already detected users).
 
-subjects.sed is a sed script extracting subjects from log line.
+subjects.sed is a simple sed script extracting subjects from log lines.
 
 subayes will create two files in db/ : Spam and Ham
 
-```shell
+Each time you find a spammer, learn their subjects as Spam, then verify the updated db against previous clean data to adjust for false positives.
 
-# Detection
+```shell
 
-logs/partage$ rg -z clamav  sftp_logs/$LOGDATE/*clamav.log* | rg -vf ex-pat |\
- sed -f subjects.sed  | utf8submimedecode | sort -u | subayes | rg Spam | \
- tee  subayes.spam | mail -E -s "[subayes detection]" postmaster
+# Detection from clamav logs
 
-# Learning more Ham words :  
+logs/partage$ rg -z clamav  sftp_logs/$LOGDATE/*clamav.log* \
+| rg -vf ex-pat | sed -f subjects.sed  | utf8submimedecode \
+| sort -u | subayes | rg ^Spam \
+| tee  subayes.spam | mail -E -s "[subayes detection]" postmaster
 
-logs/partage$ rg -z clamav  sftp_logs/$DATES/*clamav.log*  | rg -vf ex-pat|\
- sed -f subjects.sed  | utf8submimedecode | sort -u | subayes | rg Spam |\
- cut -c7- | tee subayes.spam 
+# If you want to know which words in a line are tagged as Spam,
+# use the "-E" explain option (printed on stderr).
 
- # edit ex-pat ( when you find new spammer address )
+$ subayes -E < subayes.spam  
 
+# Learning more Ham words :  
  # edit subayes.spam  (when you have false positives and relearn :)
 
 logs/partage$ subayes  -v -learnHam -d subayes.spam          
-
-# If you want to know what are the words tagged with Spam in a line, 
-# use "-E explain", save, edit and  relearn.
-
-$ subayes -E < 2023.subjects 2>&1 | awk '/^\[/ { if ($4=="Spam") print $2 }' |\
-  sort -u | tee  subayes.words  
+( -d is optional, subayes.spam is the default data file)
 
 # Efficiency :
 
-logs/partage$  subayes < /tmp/Hacked-account-Subjects | cut -d: -f1 | sort | uniq -c
+logs/partage$  subayes < /tmp/Hacked-account-Subjects \
+| cut -d: -f1 | sort | uniq -c
 5658 Ham
 39016 Spam ( meaning 87% detection without false positives from filtered subjects)
 

diff --git a/main.go b/main.go
@@ -5,6 +5,9 @@ package main
 // v0.1 : working draft.
 // v0.2 : minlen words, better split func, default bayes class, +main_test.go
 // v0.3 : -E options for explaining and showing scores
+//
+// TODO :
+// - how to remove item from db ?
 
 import (
 	"bufio"
@@ -14,7 +17,7 @@ import (
 	"os"
 	"regexp"
 
-	// Credits for "github.com/jbrukh/bayesian"
+	// Credits to "github.com/jbrukh/bayesian"
 	"github.com/jbrukh/bayesian"
 )
 
@@ -25,8 +28,9 @@ var (
 	Spam                bayesian.Class = "Spam"
 	Ham                 bayesian.Class = "Ham"
 	minlength                          = 4
-	// words = regexp.MustCompile("[\\p{L}]+")
+
 	words = regexp.MustCompile(`[\p{L}]+`)
+	// See http://www.unicode.org/reports/tr44/#General_Category_Values
 )
 
 func main() {
@@ -136,6 +140,9 @@ func classify(c *bayesian.Classifier, pattern []string, d bayesian.Class) bayesi
 	}
 	//  ProbScores return scores ([]float64), indexofclass, strict(?)
 	_, likelyb, _ := c.ProbScores(pattern)
+	// Should the strict result be tested?
+	// _, likelyb, strict := c.ProbScores(pattern)
+	// If strict is false, return the default class d?
 	return c.Classes[likelyb]
 }
 
@@ -170,19 +177,6 @@ func split(s string) []string {
 	return words.FindAllString(s, -1)
 }
 
-// Generic version (string or int )
-// func removeDuplicate[T string | int](sliceList []T) []T {
-// 	allKeys := make(map[T]bool)
-// 	list := []T{}
-// 	for _, item := range sliceList {
-// 		if _, value := allKeys[item]; !value {
-// 			allKeys[item] = true
-// 			list = append(list, item)
-// 		}
-// 	}
-// 	return list
-// }
-
 // removeduplicate
 func removeDuplicate(sliceList []string, length int) []string {
 	allKeys := make(map[string]bool)