Merge pull request #43 from SupervisedStylometry/moreTests

Starting to create more tests
SupervisedStylometry · Feb 15, 2024 · dbe264b · dbe264b
2 parents c8d4d60 + b005a17
commit dbe264b
Show file tree

Hide file tree

Showing 5 changed files with 113 additions and 27 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
     - uses: actions/checkout@v3

diff --git a/main.py b/main.py
@@ -1,7 +1,6 @@
 import superstyl.preproc.tuyau as tuy
 import superstyl.preproc.features_extract as fex
 from superstyl.preproc.text_count import count_process
-import fasttext
 import pandas
 import json
 # from multiprocessing import Pool

diff --git a/requirements.txt b/requirements.txt
@@ -1,17 +1,17 @@
-langdetect==1.0.9
-joblib==1.2.0
-lxml==4.9.1
-nltk==3.6.6
-numpy==1.26.4
-pybind11==2.8.1
-scikit-learn==1.2.1
-scipy==1.10.0
-six==1.16.0
-tqdm==4.64.1
-unidecode==1.3.2
-pandas==2.2.0
-pyarrow==15.0.0
-argparse==1.4.0
-regex==2022.10.31
-matplotlib==3.6.2
-imbalanced-learn==0.8.1
+langdetect>=1.0.9
+joblib>=1.2.0
+lxml>=4.9.1
+nltk>=3.6.6
+numpy>=1.26.4
+pybind11>=2.8.1
+scikit-learn>=1.2.1
+scipy>=1.10.0
+six>=1.16.0
+tqdm>=4.64.1
+unidecode>=1.3.2
+pandas>=2.2.0
+pyarrow>=15.0.0
+argparse>=1.4.0
+regex>=2022.10.31
+matplotlib>=3.6.2
+imbalanced-learn>=0.8.1
diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
@@ -8,9 +8,10 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     """
     Get word counts from  a text
     :param text: the source text
+    :param feat_list: a list of features to be selected
     :param feats: the type of feats (words, chars, etc.)
     :param n: the length of n-grams
-    :param relFreqs: whether or not to compute relative freqs
+    :param relFreqs: whether to compute relative freqs
     :return: feature frequencies in text
     """
 
@@ -51,8 +52,8 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
 
 def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
     """
-
     :param myTexts: a 'myTexts' object, containing documents to be processed
+    :param relFreqs: whether to compute relative freqs
     :param feats: type of feats (words, chars)
     :param n: n-grams length
     :return: list of features, with total frequency
@@ -65,7 +66,6 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
         my_feats.update(counts)
 
     # sort them
-
     my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]
 
     return my_feats
@@ -77,7 +77,7 @@ def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False):
     :param myTexts: the document collection
     :param feats: the type of feats (words, chars, etc.)
     :param n: the length of n-grams
-    :param relFreqs: whether or not to compute relative freqs
+    :param relFreqs: whether to compute relative freqs
     :return: the collection with, for each text, a 'wordCounts' dictionary
     """
 

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -1,15 +1,102 @@
 import unittest
-from superstyl.preproc.tuyau import normalise
+import superstyl.preproc.tuyau
+import superstyl.preproc.features_extract
 
 class DataLoading(unittest.TestCase):
+    # First, testing the tuyau features
     def test_normalise(self):
         text = " Hello,  Mr. 𓀁, how are §§ you; doing?"
         expected_default = "hello mr how are you doing"
-        self.assertEqual(normalise(text), expected_default)
+        self.assertEqual(superstyl.preproc.tuyau.normalise(text), expected_default)
         expected_keeppunct = "Hello, Mr. , how are SSSS you; doing?"
-        self.assertEqual(normalise(text, keep_punct=True), expected_keeppunct)
+        self.assertEqual(superstyl.preproc.tuyau.normalise(text, keep_punct=True), expected_keeppunct)
         expected_keepsym = "Hello, Mr. 𓀁, how are §§ you; doing?"
-        self.assertEqual(normalise(text, keep_sym=True), expected_keepsym)
+        self.assertEqual(superstyl.preproc.tuyau.normalise(text, keep_sym=True), expected_keepsym)
+
+    def test_detect_lang(self):
+        french = "Bonjour, Monsieur, comment allez-vous?"
+        # NB: language detection fails on an input as short as this one:
+        # english = "Hello, How do you do good sir?"
+        # ...and this slightly longer input is still too hard for it:
+        # english = "Hello, How do you do good sir? Are you well today?"
+        english = "Hello, How do you do good sir? Are you well today? Is this so bloody hard? Really, this is still failing?"
+        italian = "Buongiorno signore, come sta?"
+        # TODO: find a detector that also handles old (historical) languages, as fasttext did…
+        self.assertEqual(superstyl.preproc.tuyau.detect_lang(french), "fr")
+        self.assertEqual(superstyl.preproc.tuyau.detect_lang(english), "en")
+        self.assertEqual(superstyl.preproc.tuyau.detect_lang(italian), "it")
+
+    # Now, lower level features,
+    # from features_extract
+    def test_counts(self):
+        text = "the cat the dog the squirrel the cat the cat"
+        superstyl.preproc.features_extract.count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False)
+        self.assertEqual(
+            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False),
+            {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1}
+        )
+        self.assertEqual(
+            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=1, relFreqs=True),
+            {'the': 0.5, 'cat': 0.3, 'dog': 0.1, 'squirrel': 0.1}
+        )
+        self.assertEqual(
+            superstyl.preproc.features_extract.count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=False),
+            {'the': 5, 'cat': 3}
+        )
+        self.assertEqual(
+            superstyl.preproc.features_extract.count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=True),
+            {'the': 0.5, 'cat': 0.3}
+        )
+        self.assertEqual(
+            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=2, relFreqs=False),
+            {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1}
+        )
+        self.assertEqual(
+            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="words", n=2, relFreqs=True),
+            {'the_cat': 3/9, 'cat_the': 2/9, 'the_dog': 1/9, 'dog_the': 1/9, 'the_squirrel': 1/9, 'squirrel_the': 1/9}
+        )
+
+        text = "the yo yo"
+        self.assertEqual(
+            superstyl.preproc.features_extract.count_words(text, feat_list=None, feats="chars", n=3, relFreqs=False),
+            {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1}
+        )
+        self.assertEqual(
+            superstyl.preproc.features_extract.count_words(text, feat_list=['the'], feats="chars", n=3, relFreqs=True),
+            {'the': 1/7}
+        )
+
+    # Testing the processing of "myTexts" objects
+    def test_get_feature_list(self):
+        myTexts = [
+            {"name": "Letter1", "aut": "Smith", "text": "This is the text", "lang": "en"},
+            {"name": "Letter2", "aut": "Smith", "text": "This is also the text", "lang": "en"},
+            {"name": "Letter1", "aut": "Dupont", "text": "Voici le texte", "lang": "fr"},
+        ]
+        self.assertEqual(
+            superstyl.preproc.features_extract.get_feature_list(myTexts, feats="words", n=1, relFreqs=False),
+            [('This', 2), ('is', 2), ('the', 2), ('text', 2), ('also', 1), ('Voici', 1), ('le', 1), ('texte', 1)]
+        )
+
+    def test_get_counts(self):
+        myTexts = [
+            {"name": "Letter1", "aut": "Smith", "text": "This is the text", "lang": "en"},
+            {"name": "Letter2", "aut": "Smith", "text": "This is also the text", "lang": "en"},
+            {"name": "Letter1", "aut": "Dupont", "text": "Voici le texte", "lang": "fr"},
+        ]
+
+        self.assertEqual(
+            superstyl.preproc.features_extract.get_counts(myTexts, ['the', 'is', 'also', 'le'], feats = "words",
+                                                          n = 1, relFreqs = True),
+            [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en',
+              'wordCounts': {'the': 0.25, 'is': 0.25}},
+             {'name': 'Letter2', 'aut': 'Smith', 'text': 'This is also the text', 'lang': 'en',
+              'wordCounts': {'the': 0.2, 'is': 0.2, 'also': 0.2}},
+             {'name': 'Letter1', 'aut': 'Dupont', 'text': 'Voici le texte', 'lang': 'fr', 'wordCounts': {'le': 1/3}}]
+        )
+
+        #TODO: a lot more tests
+
 
 
 if __name__ == '__main__':