SupervisedStylometry · Jean-Baptiste-Camps · May 6, 2024 · Mar 6, 2024 · Mar 6, 2024 · Mar 25, 2024
diff --git a/load_corpus.py b/load_corpus.py
@@ -75,7 +75,7 @@
 
     if not args.f:
         with open(feat_file, "w") as out:
-            out.write(json.dumps(my_feats, ensure_ascii=False))
+            out.write(json.dumps(my_feats, ensure_ascii=False, indent=0))
             print("Features list saved to " + feat_file)
 
     corpus.to_csv(corpus_file)

diff --git a/requirements.txt b/requirements.txt
@@ -5,7 +5,7 @@ nltk>=3.6.6
 numpy>=1.26.4
 pybind11>=2.8.1
 scikit-learn>=1.3.0
-scipy>=1.10.0
+scipy>=1.10.0,<1.13.0
 six>=1.16.0
 tqdm>=4.64.1
 unidecode>=1.3.2

diff --git a/superstyl/preproc/pipe.py b/superstyl/preproc/pipe.py
@@ -77,18 +77,30 @@ def detect_lang(string):
 
 
 def normalise(text, keep_punct=False, keep_sym=False):
+    """
+    Function to normalise an input string. By default, it removes all but word chars, removes accents,
+    normalises whitespace, and then normalises unicode.
+    :param keep_punct: if true, also keeps Punctuation and case distinction
+    :param keep_sym: if true, same as keep_punct, but also keeps Numbers, Symbols, Marks (such as combining diacritics),
+    as well as Private use characters, and no Unidecode is applied
+    """
     # Remove all but word chars, remove accents, and normalise space
     # and then normalise unicode
 
     if keep_sym:
-        out = re.sub(r"\s+", " ", re.sub(r"[^\p{L}\p{P}\p{N}]+", " ", text.strip()))
+        out = re.sub(r"[^\p{L}\p{P}\p{N}\p{S}\p{M}\p{Co}]+", " ", text)
 
     else:
         if keep_punct:
-            out = re.sub(r"\s+", " ", unidecode.unidecode(re.sub(r"[^\p{L}\p{P}]+", " ", text.strip())))
+            out = re.sub(r"[^\p{L}\p{P}]+", " ", text)
 
         else:
-            out = re.sub(r"\s+", " ", unidecode.unidecode(re.sub(r"[\W0-9]+", " ", text.lower()).strip()))
+            #out = re.sub(r"[\W0-9]+", " ", text.lower())
+            out = re.sub(r"[^\p{L}]+", " ", text.lower())
+
+        out = unidecode.unidecode(out)
+
+    out = re.sub(r"\s+", " ", out).strip()
 
     return out
 

diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py
@@ -283,14 +283,15 @@ def test_load_texts_txt(self):
 
         self.assertEqual(results, expected)
 
-        #TODO: test keep_sym, according to revised definition
         # WHEN
-        # results = superstyl.preproc.pipe.load_texts(self.paths, identify_lang=False, format="txt",
-        #                                             keep_sym=True, max_samples=None)
+        results = superstyl.preproc.pipe.load_texts(self.paths, identify_lang=False, format="txt",
+                                                     keep_sym=True, max_samples=None)
         # THEN
-        # expected = [{'name': 'Dupont_Letter1.txt', 'aut': 'Dupont', 'text': 'Voici le texte!', 'lang': 'NA'},
-        #            {'name': 'Smith_Letter1.txt', 'aut': 'Smith', 'text': 'This is the text!', 'lang': 'NA'},
-        #            {'name': 'Smith_Letter2.txt', 'aut': 'Smith', 'text': 'This is, also , the text!', 'lang': 'NA'}]
+        expected = [{'name': 'Dupont_Letter1.txt', 'aut': 'Dupont', 'text': 'Voici le texte!', 'lang': 'NA'},
+                   {'name': 'Smith_Letter1.txt', 'aut': 'Smith', 'text': 'This is the text!', 'lang': 'NA'},
+                   {'name': 'Smith_Letter2.txt', 'aut': 'Smith', 'text': 'This is, © also © , the text!', 'lang': 'NA'}]
+
+        self.assertEqual(results, expected)
 
         # WHEN
         results = superstyl.preproc.pipe.load_texts(self.paths, identify_lang=True, format="txt", keep_punct=True,
@@ -314,9 +315,9 @@ def test_docs_to_samples(self):
         self.assertEqual(results, expected)
 
         # WHEN
-        results = superstyl.preproc.pipe.docs_to_samples(self.paths, identify_lang=False, size=2, step=1,
+        results = superstyl.preproc.pipe.docs_to_samples(sorted(self.paths), identify_lang=False, size=2, step=1,
                                                           units="words", format="txt", keep_punct=True,
-                                                          keep_sym=False,
+                                                          keep_sym=True,
                                                           max_samples=None)
 
         # THEN
@@ -329,14 +330,15 @@ def test_docs_to_samples(self):
                     {'name': 'Smith_Letter1.txt_3-5', 'aut': 'Smith', 'text': 'text !', 'lang': 'NA'},
                     {'name': 'Smith_Letter2.txt_0-2', 'aut': 'Smith', 'text': 'This is', 'lang': 'NA'},
                     {'name': 'Smith_Letter2.txt_1-3', 'aut': 'Smith', 'text': 'is ,', 'lang': 'NA'},
-                    {'name': 'Smith_Letter2.txt_2-4', 'aut': 'Smith', 'text': ', also', 'lang': 'NA'},
-                    {'name': 'Smith_Letter2.txt_3-5', 'aut': 'Smith', 'text': 'also ,', 'lang': 'NA'},
-                    {'name': 'Smith_Letter2.txt_4-6', 'aut': 'Smith', 'text': ', the', 'lang': 'NA'},
-                    {'name': 'Smith_Letter2.txt_5-7', 'aut': 'Smith', 'text': 'the text', 'lang': 'NA'},
-                    {'name': 'Smith_Letter2.txt_6-8', 'aut': 'Smith', 'text': 'text !', 'lang': 'NA'}]
-        self.assertEqual(results, expected)
+                    {'name': 'Smith_Letter2.txt_2-4', 'aut': 'Smith', 'text': ', ©', 'lang': 'NA'},
+                    {'name': 'Smith_Letter2.txt_3-5', 'aut': 'Smith', 'text': '© also', 'lang': 'NA'},
+                    {'name': 'Smith_Letter2.txt_4-6', 'aut': 'Smith', 'text': 'also ©', 'lang': 'NA'},
+                    {'name': 'Smith_Letter2.txt_5-7', 'aut': 'Smith', 'text': '© ,', 'lang': 'NA'},
+                    {'name': 'Smith_Letter2.txt_6-8', 'aut': 'Smith', 'text': ', the', 'lang': 'NA'},
+                    {'name': 'Smith_Letter2.txt_7-9', 'aut': 'Smith', 'text': 'the text', 'lang': 'NA'},
+                    {'name': 'Smith_Letter2.txt_8-10', 'aut': 'Smith', 'text': 'text !', 'lang': 'NA'}]
 
-        # TODO: test keep_sym
+        self.assertEqual(results, expected)
 
         # WHEN
         results = superstyl.preproc.pipe.docs_to_samples(self.paths, identify_lang=True, size=2, step=None,
@@ -492,13 +494,26 @@ class DataLoading(unittest.TestCase):
      # Now down to lower level features
     # First, testing the pipe features
     def test_normalise(self):
-        text = " Hello,  Mr. 𓀁, how are §§ you; doing?"
-        expected_default = "hello mr how are you doing"
-        self.assertEqual(superstyl.preproc.pipe.normalise(text), expected_default)
-        expected_keeppunct = "Hello, Mr. , how are SSSS you; doing?"
-        self.assertEqual(superstyl.preproc.pipe.normalise(text, keep_punct=True), expected_keeppunct)
-        expected_keepsym = "Hello, Mr. 𓀁, how are §§ you; doing?" #TODO: modify test according to new def
-        self.assertEqual(superstyl.preproc.pipe.normalise(text, keep_sym=True), expected_keepsym)
+        # FEATURE
+        # Normalise an input text, according to different options
+        # SCENARIO
+        # GIVEN
+        text = " Hello,  Mr. 𓀁, how are §§ you; doing? ſõ ❡"
+        # WHEN
+        results = superstyl.preproc.pipe.normalise(text)
+        # THEN
+        expected_default = "hello mr how are you doing s o"
+        self.assertEqual(results, expected_default)
+        # WHEN
+        results = superstyl.preproc.pipe.normalise(text, keep_punct=True)
+        # THEN
+        expected_keeppunct = "Hello, Mr. , how are SSSS you; doing? s o"
+        self.assertEqual(results, expected_keeppunct)
+        # WHEN
+        results = superstyl.preproc.pipe.normalise(text, keep_sym=True)
+        # THEN
+        expected_keepsym = "Hello, Mr. 𓀁, how are §§ you; doing? ſõ ❡"
+        self.assertEqual(results, expected_keepsym)
 
     def test_detect_lang(self):
         french = "Bonjour, Monsieur, comment allez-vous?"

diff --git a/tests/test_train_svm.py b/tests/test_train_svm.py
@@ -52,6 +52,11 @@ def test_train_svm(self):
         self.assertEqual(results["classification_report"], expected_results["classification_report"])
         self.assertEqual(results["misattributions"].to_dict(), expected_results["misattributions"])
         self.assertEqual(list(results.keys()), expected_keys)
-        # This is only the first minimal test for this function
+
+        # WHEN
+        #results = superstyl.train_svm(train, test, final_pred=False, balance="SMOTETomek")
+
+
+        # These are only the first minimal tests for this function
 
 
diff --git a/train_svm.py b/train_svm.py
@@ -69,12 +69,16 @@
     else:
         args.o = ''
 
-    svm["confusion_matrix"].to_csv(args.o+"confusion_matrix.csv")
-    svm["misattributions"].to_csv(args.o+"misattributions.csv")
+
+    if args.cross_validate is not None or (args.test_path is not None and not args.final):
+        svm["confusion_matrix"].to_csv(args.o+"confusion_matrix.csv")
+        svm["misattributions"].to_csv(args.o+"misattributions.csv")
+
     joblib.dump(svm["pipeline"], args.o+'mySVM.joblib')
 
-    print(".......... Writing final predictions to " + args.o + "FINAL_PREDICTIONS.csv ........")
-    svm["final_predictions"].to_csv(args.o+"FINAL_PREDICTIONS.csv")
+    if args.final:
+        print(".......... Writing final predictions to " + args.o + "FINAL_PREDICTIONS.csv ........")
+        svm["final_predictions"].to_csv(args.o+"FINAL_PREDICTIONS.csv")
 
     if args.get_coefs:
         print(".......... Writing coefficients to disk ........")