From 7f11fcd252152227270c108e2dc5d9856893d9cc Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 16:25:40 +0100 Subject: [PATCH 1/7] include affixes in standard load_corpus --- features_filter.py | 36 ------------------------- load_corpus.py | 14 +++++++--- superstyl/preproc/features_extract.py | 38 +++++++++++++++++---------- superstyl/preproc/features_select.py | 36 ------------------------- tests/test_main.py | 6 ++--- 5 files changed, 38 insertions(+), 92 deletions(-) delete mode 100755 features_filter.py delete mode 100755 superstyl/preproc/features_select.py diff --git a/features_filter.py b/features_filter.py deleted file mode 100755 index 67580f87..00000000 --- a/features_filter.py +++ /dev/null @@ -1,36 +0,0 @@ -import superstyl.preproc.features_select as fs -import json -import regex as re - -#TODO: implement more types from Sapkota et al? - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('-f', action="store", help="list of features in json (such as produced by main.py)", required=True) - parser.add_argument('--affixes_grams', action='store_true', help="Keep affixes (space starting or ending n-grams)", default=False) - parser.add_argument('--punct_grams', action='store_true', help="Keep n-grams containing punctuation", default=False) - #parser.add_argument('--word-grams', action='store_true', help="Keep n-grams with word content", default=False) - args = parser.parse_args() - - print(".......loading preexisting feature list.......") - with open(args.f, 'r') as f: - my_feats = json.loads(f.read()) - - print(".......Filtering feature list.......") - my_feats = fs.filter_ngrams(my_feats, affixes=args.affixes_grams, punct=args.punct_grams) - - # name the output - outfile = re.sub(r"\.json$", "", args.f) - if args.affixes_grams: - outfile = outfile+"_affixes" - - if args.punct_grams: - outfile = outfile + "_punct" - - outfile = outfile+".json" - - print(".......Writing .......") - with open(outfile, "w") as out: - out.write(json.dumps(my_feats)) diff --git a/load_corpus.py b/load_corpus.py index f4638556..3ebfc275 100755 --- a/load_corpus.py +++ b/load_corpus.py @@ -12,13 +12,21 @@ parser.add_argument('-s', nargs='+', help="paths to files") parser.add_argument('-o', action='store', help="optional base name of output files", type=str, default=False) parser.add_argument('-f', action="store", help="optional list of features in json", default=False) - parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str, default="words") + parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - " + "as per Sapkota et al. 2015 - or POS). 
POS are currently "
+                                                   "only implemented for Modern English", type=str,
+                        default="words", choices=["words", "chars", "affixes", "POS"])
     parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
     parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
     parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
-    parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt")
+    parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt",
+                        choices=["txt", "xml", "tei"]
+                        )
     parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
-    parser.add_argument('--sample_units', action='store', help="Units of length for sampling (words, verses; default: words)", default="words", type=str)
+    parser.add_argument('--sample_units', action='store', help="Units of length for sampling "
+                                                               "(words, verses; default: words)",
+                        choices=["words", "verses"],
+                        default="words", type=str)
     parser.add_argument('--sample_size', action='store', help="Size for sampling (default: 3000)", default=3000, type=int)
     parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
     parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index 0b4ddee1..2d3f7989 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -2,27 +2,37 @@
 from collections import Counter
 import nltk.tokenize
 import nltk
+import regex as re
 
-
-def count_words(text, feats = "words", n = 1):
+def count_features(text, feats ="words", n = 1):
     """
-    Get feature counts from a text (words, chars or POS n-grams)
+    Get feature counts from a text (words, chars or POS n-grams, or affixes (+ punct if keep_punct)),
+    following Sapkota et al., NAACL 2015
     :param text: the source text
-    :param feats: the type of feats: words, chars, POS (supported only for English)
+    :param feats: the type of feats: words, chars, POS (supported only for English), or affixes
     :param n: the length of n-grams
     :return: features absolute frequencies in text as a counter
     """
-    # Should this be called count_words ? It counts other features as well... count_features ? It's just a grep and replace away.
-    if feats == "words":
+    if feats == "words" or feats == "affixes":
         tokens = nltk.tokenize.wordpunct_tokenize(text)
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
 
     elif feats == "chars":
-        tokens = list(text.replace(' ', '_'))
-        if n > 1:
-            tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]
+        tokens = [re.sub(r'\p{Z}', '_', ''.join(ngram)) for ngram in nltk.ngrams(text, n)]
+
+    elif feats == "affixes":
+        words = nltk.tokenize.wordpunct_tokenize(text)
+        ngrams = [''.join(ngram) for ngram in nltk.ngrams(text, n)]
+        # and now get all types from Sapkota et al.
+ affs = [w[:3] for w in words if len(w) > n] + [w[-3:] for w in words if len(w) > n] + # space affixes (and punct affixes if keep_punct has been enabled) + space_affs_and_punct = [re.sub(r'\p{Z}', '_', ngram) + for ngram in ngrams + if re.search(r'(^\p{Z})|(\p{Z}$)|(\p{P})', ngram) + ] + tokens = affs + space_affs_and_punct #POS in english with NLTK - need to propose spacy later on elif feats == "pos": @@ -40,7 +50,7 @@ def count_words(text, feats = "words", n = 1): #Adding an error message in case some distracted guy like me would enter something wrong: else: - raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.") + raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes' or 'pos'.") counts = Counter() counts.update(tokens) @@ -65,14 +75,14 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True): """ :param myTexts: a 'myTexts' object, containing documents to be processed :param feat_list: a list of features to be selected - :param feats: type of feats (words, chars, POS) + :param feats: type of feats (words, chars, affixes or POS) :param n: n-grams length :return: list of features, with total frequency """ my_feats = Counter() for text in myTexts: - counts = count_words(text["text"], feats=feats, n=n) + counts = count_features(text["text"], feats=feats, n=n) my_feats.update(counts) @@ -90,7 +100,7 @@ def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False Get counts for a collection of texts :param myTexts: the document collection :param feat_list: a list of features to be selected (None for all) - :param feats: the type of feats (words, chars, etc.) + :param feats: the type of feats (words, chars, affixes, POS) :param n: the length of n-grams :param relFreqs: whether to compute relative freqs :return: the collection with, for each text, a 'wordCounts' dictionary @@ -98,7 +108,7 @@ def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False for i in enumerate(myTexts): - counts = count_words(myTexts[i[0]]["text"], feats=feats, n=n) + counts = count_features(myTexts[i[0]]["text"], feats=feats, n=n) if relFreqs: counts = relative_frequencies(counts) diff --git a/superstyl/preproc/features_select.py b/superstyl/preproc/features_select.py deleted file mode 100755 index 3d1e9418..00000000 --- a/superstyl/preproc/features_select.py +++ /dev/null @@ -1,36 +0,0 @@ -import regex as re - -def filter_ngrams(feat_list, affixes=True, punct=True): - """ - Filter a list of features in input to yield a selection of n-grams, according to the parameters, - following Sapkota et al., NAACL 2015 - feat_list: the feature list (typically, coming of load_corpus.py and loaded) - affixes: affixes (n-grams beginning or ending by space) - punct: n-grams containing punctuation - """ - - out = [] - - if affixes: - out = out + [f for f in feat_list if f[0].startswith('_') or f[0].endswith('_')] - switch = True - seen = set([f[0] for f in out]) - - if punct: - # a bit trickier: need to remove underscore not to include n-grams with just - # underscore as punctuation - if switch: - out = out + [f for f in feat_list if re.match(r"\p{P}", re.sub('_', '', f[0])) - and f[0] not in seen] - - else: - out = out + [f for f in feat_list if re.match(r"\p{P}", re.sub('_', '', f[0]))] - - return out - - - - - - - diff --git a/tests/test_main.py b/tests/test_main.py index 3b1bcb4b..b18b5f40 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -391,13 +391,13 @@ def test_counts(self): # GIVEN text = "the cat the 
dog the squirrel the cat the cat" # WHEN - results = superstyl.preproc.features_extract.count_words(text, feats = "words", n = 1) + results = superstyl.preproc.features_extract.count_features(text, feats ="words", n = 1) # THEN expected = {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1} self.assertEqual(results, expected) # WHEN - results = superstyl.preproc.features_extract.count_words(text, feats="words", n=2) + results = superstyl.preproc.features_extract.count_features(text, feats="words", n=2) # THEN expected = {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1} self.assertEqual(results, expected) @@ -405,7 +405,7 @@ def test_counts(self): # GIVEN text = "the yo yo" # WHEN - results = superstyl.preproc.features_extract.count_words(text, feats="chars", n=3) + results = superstyl.preproc.features_extract.count_features(text, feats="chars", n=3) # THEN expected = {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1} self.assertEqual(results, expected) From effce56aa36b5828cdd6585aacf58b34d592b524 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 17:40:32 +0100 Subject: [PATCH 2/7] fixed the function and added tests --- superstyl/preproc/features_extract.py | 25 +++-- tests/{test_main.py => test_load_corpus.py} | 113 ++++++++++++++++++-- 2 files changed, 123 insertions(+), 15 deletions(-) rename tests/{test_main.py => test_load_corpus.py} (72%) diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py index 2d3f7989..874cd7eb 100755 --- a/superstyl/preproc/features_extract.py +++ b/superstyl/preproc/features_extract.py @@ -11,20 +11,24 @@ def count_features(text, feats ="words", n = 1): :param text: the source text :param feats: the type of feats: words, chars, POS (supported only for English), or affixes :param n: the length of n-grams - :return: features absolute frequencies in text as a counter + :return: features absolute frequencies in text as a counter, and the total of frequencies """ - if feats == "words" or feats == "affixes": + if feats == "words": tokens = nltk.tokenize.wordpunct_tokenize(text) if n > 1: tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))] + total = len(tokens) elif feats == "chars": tokens = [re.sub(r'\p{Z}', '_', ''.join(ngram)) for ngram in nltk.ngrams(text, n)] + total = len(tokens) elif feats == "affixes": words = nltk.tokenize.wordpunct_tokenize(text) ngrams = [''.join(ngram) for ngram in nltk.ngrams(text, n)] + # relative frequencies should be computed from all existing n-grams + total = len(ngrams) # and now get all types from Sapkota et al. 
affs = [w[:3] for w in words if len(w) > n] + [w[-3:] for w in words if len(w) > n] # space affixes (and punct affixes if keep_punct has been enabled) @@ -42,6 +46,7 @@ def count_features(text, feats ="words", n = 1): tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))] else: tokens = pos_tags + total = len(tokens) # Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better #elif feats == "sentenceLength": @@ -55,16 +60,16 @@ def count_features(text, feats ="words", n = 1): counts = Counter() counts.update(tokens) - return counts + return counts, total -def relative_frequencies(wordCounts): +def relative_frequencies(wordCounts, total): """ For a counter of word counts, return the relative frequencies :param wordCounts: a dictionary of word counts + :param total, the total number of features :return a counter of word relative frequencies """ - total = sum(wordCounts.values()) for t in wordCounts.keys(): wordCounts[t] = wordCounts[t] / total @@ -80,14 +85,16 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True): :return: list of features, with total frequency """ my_feats = Counter() + total = 0 for text in myTexts: - counts = count_features(text["text"], feats=feats, n=n) + counts, text_total = count_features(text["text"], feats=feats, n=n) my_feats.update(counts) + total = total + text_total if relFreqs: - my_feats = relative_frequencies(my_feats) + my_feats = relative_frequencies(my_feats, total) # sort them my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)] @@ -108,10 +115,10 @@ def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False for i in enumerate(myTexts): - counts = count_features(myTexts[i[0]]["text"], feats=feats, n=n) + counts, total = count_features(myTexts[i[0]]["text"], feats=feats, n=n) if relFreqs: - counts = relative_frequencies(counts) + counts = relative_frequencies(counts, total) if feat_list: # and keep only the ones in the feature list diff --git a/tests/test_main.py b/tests/test_load_corpus.py similarity index 72% rename from tests/test_main.py rename to tests/test_load_corpus.py index b18b5f40..faa3c651 100644 --- a/tests/test_main.py +++ b/tests/test_load_corpus.py @@ -6,7 +6,6 @@ import superstyl.preproc.embedding import superstyl.preproc.select import superstyl.preproc.text_count -import superstyl.preproc.features_select import os import glob @@ -119,9 +118,103 @@ def test_load_corpus(self): expected_feats = [('this', 2 / 12), ('is', 2 / 12), ('the', 2 / 12), ('text', 2 / 12)] self.assertEqual(feats, expected_feats) + # WHEN + corpus, feats = superstyl.load.load_corpus(self.paths, feats="chars", n=3, format="txt", keep_punct=True, + relFreqs=False) + + # THEN + expected_feats = [('e_t', 3), ('_te', 3), ('tex', 3), ('ext', 3), ('is_', 3), ('Thi', 2), ('his', 2), ('s_i', 2), + ('_is', 2), ('_th', 2), ('the', 2), ('he_', 2), ('xt!', 2), ('Voi', 1), ('oic', 1), ('ici', 1), + ('ci_', 1), ('i_l', 1), ('_le', 1), ('le_', 1), ('xte', 1), ('te!', 1), ('is,', 1), ('s,_', 1), + (',_a', 1), ('_al', 1), ('als', 1), ('lso', 1), ('so_', 1), ('o_,', 1), ('_,_', 1), (',_t', 1), + ('s_t', 1)] + + + expected_corpus = {'author': + {'Dupont_Letter1.txt': 'Dupont', 'Smith_Letter2.txt': 'Smith', 'Smith_Letter1.txt': 'Smith'}, + 'lang': {'Dupont_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA', 'Smith_Letter1.txt': 'NA'}, + 'e_t': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + '_te': {'Dupont_Letter1.txt': 1, 
'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'tex': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'ext': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'is_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 2}, + 'Thi': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'his': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 's_i': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + '_is': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + '_th': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'the': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'he_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'xt!': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'Voi': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'oic': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'ici': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'ci_': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'i_l': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + '_le': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'le_': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'xte': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'te!': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'is,': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 's,_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + ',_a': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + '_al': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 'als': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 'lso': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 'so_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 'o_,': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + '_,_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + ',_t': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 's_t': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 1}} + + + self.assertEqual(sorted(feats), sorted(expected_feats)) + self.assertEqual(corpus.to_dict(), expected_corpus) + # WHEN + corpus, feats = superstyl.load.load_corpus(self.paths, feats="affixes", n=3, format="txt", keep_punct=True) - # TODO: test other options + # THEN + expected_feats = [('_te', 3/51), ('tex', 3/51), ('ext', 2/51), ('is_', 3/51), ('Thi', 2/51), ('his', 2/51), + ('_is', 2/51), ('_th', 2/51), ('he_', 2/51), ('xt!', 2/51), ('Voi', 1/51),('ici', 1/51), + ('ci_', 1/51), ('_le', 1/51), ('le_', 1/51), ('xte', 1/51), ('te!', 1/51), ('is,', 1/51), + ('s,_', 1/51), (',_a', 1/51), ('_al', 1/51), ('als', 1/51), ('lso', 1/51), ('so_', 1/51), + ('o_,', 1/51), ('_,_', 1/51), (',_t', 1/51)] + + expected_corpus = {'author': + {'Dupont_Letter1.txt': 'Dupont', 'Smith_Letter2.txt': 'Smith', + 'Smith_Letter1.txt': 'Smith'}, + 'lang': {'Dupont_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA', 
'Smith_Letter1.txt': 'NA'}, + '_te': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'tex': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'ext': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'is_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 2/15}, + 'Thi': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'his': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + '_is': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + '_th': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'he_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'xt!': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'Voi': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'ici': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'ci_': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + '_le': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'le_': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'xte': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'te!': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'is,': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 's,_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + ',_a': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + '_al': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 'als': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 'lso': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 'so_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 'o_,': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + '_,_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + ',_t': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}} + + self.assertEqual(sorted(feats), sorted(expected_feats)) + self.assertEqual(corpus.to_dict(), expected_corpus) def test_load_texts_txt(self): @@ -393,21 +486,29 @@ def test_counts(self): # WHEN results = superstyl.preproc.features_extract.count_features(text, feats ="words", n = 1) # THEN - expected = {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1} + expected = ({'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1}, 10) self.assertEqual(results, expected) # WHEN results = superstyl.preproc.features_extract.count_features(text, feats="words", n=2) # THEN - expected = {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1} + expected = ({'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1}, 9) self.assertEqual(results, expected) # GIVEN - text = "the yo yo" + text = "These yo yo!" 
# WHEN results = superstyl.preproc.features_extract.count_features(text, feats="chars", n=3) # THEN - expected = {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1} + expected = ({'_yo': 2, 'The': 1, 'hes': 1, 'ese': 1, 'se_': 1, 'e_y': 1, 'yo_': 1, 'o_y': 1, 'yo!': 1}, 10) + self.assertEqual(results, expected) + + # GIVEN + text = "These yo yo!" + # WHEN + results = superstyl.preproc.features_extract.count_features(text, feats="affixes", n=3) + # THEN + expected = ({'_yo': 2, 'The': 1, 'ese': 1, 'se_': 1, 'yo_': 1, 'yo!': 1}, 10) self.assertEqual(results, expected) def test_max_sampling(self): From 096fcc0b1e968ed67fffb693a3af04cb356be699 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 19:00:36 +0100 Subject: [PATCH 3/7] added tests for embedding --- superstyl/load.py | 4 +- superstyl/preproc/embedding.py | 24 +++++------ tests/test_load_corpus.py | 74 ++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 17 deletions(-) diff --git a/superstyl/load.py b/superstyl/load.py index 7e19b21b..eec1afdb 100644 --- a/superstyl/load.py +++ b/superstyl/load.py @@ -2,7 +2,6 @@ import superstyl.preproc.features_extract as fex from superstyl.preproc.text_count import count_process import superstyl.preproc.embedding as embed -import json import tqdm import pandas @@ -53,7 +52,8 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs if embedding: print(".......embedding counts.......") - myTexts = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size) + myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size) + feat_list = [f for f in feat_list if f[0] in my_feats] unique_texts = [text["name"] for text in myTexts] diff --git a/superstyl/preproc/embedding.py b/superstyl/preproc/embedding.py index 6d0d8404..6dc0f262 100644 --- a/superstyl/preproc/embedding.py +++ b/superstyl/preproc/embedding.py @@ -1,5 +1,3 @@ -import numpy as np -from scipy import spatial import gensim.models def load_embeddings(path): @@ -25,12 +23,7 @@ def find_similar_words(model, word, topn=10): else: return [s[0] for s in model.most_similar(word, topn=topn)] -# For tests -# myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', 'wordCounts': {'the': 1, 'this': 1}}, -# {'name': 'Letter2', 'aut': 'Smith', 'text': 'Also the text', 'lang': 'en', -# 'wordCounts': {'the': 1, 'also': 1}}] -# feat_list = ['the'] -# feat = "the" + def get_embedded_counts(myTexts, feat_list, model, topn=10): """ Replace absolute frequencies by frequencies relative to a given semantic neighbouring @@ -40,7 +33,14 @@ def get_embedded_counts(myTexts, feat_list, model, topn=10): :param model: the embeddings model :param topn: the n closest (as per cosine similarity) words on which to compute relative frequency :return: the myTexts collection with, for each text, a 'wordCounts' dictionary with said semantic relative frequencies + as well as the new feat_list with only the features that were actually used """ + # First, create the new key + for i in enumerate(myTexts): + myTexts[i[0]]["embedded"] = {} + + # keep only features present in the embedding + feat_list = [f for f in feat_list if f in list(model.index_to_key)] for feat in feat_list: similars = find_similar_words(model, feat, topn=topn) @@ -50,17 +50,11 @@ def get_embedded_counts(myTexts, feat_list, model, topn=10): else: for i in enumerate(myTexts): - if feat in myTexts[i[0]]["wordCounts"].keys(): - - if "embedded" not in 
myTexts[i[0]].keys(): - # then, initialise - myTexts[i[0]]["embedded"] = {} - total = sum([myTexts[i[0]]["wordCounts"][s] for s in [feat]+similars if s in myTexts[i[0]]["wordCounts"].keys()]) myTexts[i[0]]["embedded"][feat] = myTexts[i[0]]["wordCounts"][feat] / total - return myTexts + return myTexts, feat_list diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py index faa3c651..52887e9d 100644 --- a/tests/test_load_corpus.py +++ b/tests/test_load_corpus.py @@ -216,6 +216,29 @@ def test_load_corpus(self): self.assertEqual(sorted(feats), sorted(expected_feats)) self.assertEqual(corpus.to_dict(), expected_corpus) + # WHEN + # TODO: fix pos ! + #corpus, feats = superstyl.load.load_corpus(self.paths, feats="pos", n=2, format="txt") + + # Now, test embedding + # WHEN + corpus, feats = superstyl.load.load_corpus(self.paths, feats="words", n=1, format="txt", + embedding=THIS_DIR+"/embed/test_embedding.wv.txt", + neighbouring_size=1) + # THEN + + expected_feats = [('this', 2), ('is', 2), ('the', 2), ('text', 2), ('also', 1)] + expected_corpus = {'author': {'Dupont_Letter1.txt': 'Dupont', 'Smith_Letter1.txt': 'Smith', 'Smith_Letter2.txt': 'Smith'}, + 'lang': {'Dupont_Letter1.txt': 'NA', 'Smith_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA'}, + 'this': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.5, 'Smith_Letter2.txt': 0.5}, + 'is': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.5, 'Smith_Letter2.txt': 0.5}, + 'the': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.5, 'Smith_Letter2.txt': 0.5}, + 'text': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.5, 'Smith_Letter2.txt': 0.5}, + 'also': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.0, 'Smith_Letter2.txt': 1.0}} + self.assertEqual(feats, expected_feats) + self.assertEqual(corpus.to_dict(), expected_corpus) + + def test_load_texts_txt(self): # SCENARIO: from paths to txt, get myTexts object, i.e., a list of dictionaries @@ -525,6 +548,57 @@ def test_max_sampling(self): self.assertEqual(len([text for text in results if text["aut"] == 'Smith']), 1) +class Embed(unittest.TestCase): + model = superstyl.preproc.embedding.load_embeddings(THIS_DIR+"/embed/test_embedding.wv.txt") + def test_find_similar_words(self): + # Feature: find the n most similar words in an embedding + # GIVEN + word = "this" + # WHEN + results = superstyl.preproc.embedding.find_similar_words(self.model, word, topn=1) + # THEN + expected = ["the"] + self.assertEqual(results, expected) + + # GIVEN + word = "supercalifragilistic" + # WHEN + results = superstyl.preproc.embedding.find_similar_words(self.model, word, topn=1) + # THEN + expected = None + self.assertEqual(results, expected) + + def test_get_embedded_counts(self): + # FEATURE : for a myTexts objects, containing feature counts, a list of features, and an embedding model + # Get the relative frequencies of each words in regard to the topn most similar in the model + + # GIVEN + myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', + 'wordCounts': {'this': 1, 'is': 1, 'the': 1, 'text': 1}}, + {'name': 'Letter2', 'aut': 'Smith', 'text': 'This is also the text', 'lang': 'en', 'wordCounts': + {'this': 1, 'is': 1, 'also': 1, 'the': 1, 'text': 1}}, + {'name': 'Letter1', 'aut': 'Dupont', 'text': 'Voici le texte', 'lang': 'fr', 'wordCounts': + {'Voici': 1, 'le': 1, 'texte': 1}}] + feat_list = ["this", "the", "voici"] + # WHEN + results, new_feat_list = superstyl.preproc.embedding.get_embedded_counts(myTexts, feat_list, self.model, topn=1) + # THEN + expected = 
[{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', + 'wordCounts': {'this': 1, 'is': 1, 'the': 1, 'text': 1}, + 'embedded': {'this': 0.5, 'the': 0.5}}, + {'name': 'Letter2', 'aut': 'Smith', 'text': 'This is also the text', 'lang': 'en', + 'wordCounts': {'this': 1, 'is': 1, 'also': 1, 'the': 1, 'text': 1}, + 'embedded': {'this': 0.5, 'the': 0.5}}, + {'name': 'Letter1', 'aut': 'Dupont', 'text': 'Voici le texte', 'lang': 'fr', + 'wordCounts': {'Voici': 1, 'le': 1, 'texte': 1}, + 'embedded': {} + }] + self.assertEqual(results, expected) + self.assertEqual(new_feat_list, ["this", "the"]) + + + + # TODO: tests for SVM, etc. # Test all options of main commands, see if they are accepted or not From 43d0f446d235891c3d43bc097d6944203a4325e2 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 19:02:18 +0100 Subject: [PATCH 4/7] and the embedding file --- tests/embed/test_embedding.wv.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/embed/test_embedding.wv.txt diff --git a/tests/embed/test_embedding.wv.txt b/tests/embed/test_embedding.wv.txt new file mode 100644 index 00000000..e0a00662 --- /dev/null +++ b/tests/embed/test_embedding.wv.txt @@ -0,0 +1,7 @@ +6 100 +the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062 +, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158 0.22783 -0.16642 -0.68228 0.3587 0.42568 0.19021 0.91963 0.57555 0.46185 0.42363 -0.095399 -0.42749 -0.16567 -0.056842 -0.29595 0.26037 -0.26606 -0.070404 -0.27662 0.15821 0.69825 0.43081 0.27952 -0.45437 -0.33801 -0.58184 0.22364 -0.5778 -0.26862 -0.20425 0.56394 -0.58524 -0.14365 -0.64218 0.0054697 -0.35248 0.16162 1.1796 -0.47674 -2.7553 -0.1321 -0.047729 1.0655 1.1034 -0.2208 0.18669 0.13177 0.15117 0.7131 -0.35215 0.91348 0.61783 0.70992 0.23955 -0.14571 -0.37859 -0.045959 -0.47368 0.2385 0.20536 -0.18996 0.32507 -1.1112 -0.36341 0.98679 -0.084776 -0.54008 0.11726 -1.0194 -0.24424 0.12771 0.013884 0.080374 -0.35414 0.34951 -0.7226 0.37549 0.4441 -0.99059 0.61214 -0.35111 -0.83155 0.45293 0.082577 +this -0.57058 0.44183 0.70102 -0.41713 -0.34058 0.02339 -0.071537 0.48177 -0.013121 0.16834 -0.13389 0.040626 0.15827 -0.44342 -0.019403 -0.009661 -0.046284 0.093228 -0.27331 0.2285 0.33089 -0.36474 0.078741 0.3585 0.44757 -0.2299 0.18077 -0.6265 0.053852 -0.29154 -0.4256 0.62903 0.14393 -0.046004 -0.21007 0.48879 -0.057698 0.37431 -0.030075 -0.34494 -0.29702 0.15095 0.28248 -0.16578 0.076131 -0.093016 0.79365 -0.60489 -0.18874 -1.0173 0.31962 -0.16344 0.54177 1.1725 -0.47875 -3.3842 -0.081301 -0.3528 1.8372 0.44516 -0.52666 0.99786 -0.32178 0.033462 1.1783 -0.072905 0.39737 0.26166 0.33111 
-0.35629 -0.16558 -0.44382 -0.14183 -0.37976 0.28994 -0.029114 -0.35169 -0.27694 -1.344 0.19555 0.16887 0.040237 -0.80212 0.23366 -1.3837 -0.023132 0.085395 -0.74051 -0.073934 -0.58838 -0.085735 -0.10525 -0.51571 0.15038 -0.16694 -0.16372 -0.22702 -0.66102 0.47197 0.37253 +is -0.54264 0.41476 1.0322 -0.40244 0.46691 0.21816 -0.074864 0.47332 0.080996 -0.22079 -0.12808 -0.1144 0.50891 0.11568 0.028211 -0.3628 0.43823 0.047511 0.20282 0.49857 -0.10068 0.13269 0.16972 0.11653 0.31355 0.25713 0.092783 -0.56826 -0.52975 -0.051456 -0.67326 0.92533 0.2693 0.22734 0.66365 0.26221 0.19719 0.2609 0.18774 -0.3454 -0.42635 0.13975 0.56338 -0.56907 0.12398 -0.12894 0.72484 -0.26105 -0.26314 -0.43605 0.078908 -0.84146 0.51595 1.3997 -0.7646 -3.1453 -0.29202 -0.31247 1.5129 0.52435 0.21456 0.42452 -0.088411 -0.17805 1.1876 0.10579 0.76571 0.21914 0.35824 -0.11636 0.093261 -0.62483 -0.21898 0.21796 0.74056 -0.43735 0.14343 0.14719 -1.1605 -0.050508 0.12677 -0.014395 -0.98676 -0.091297 -1.2054 -0.11974 0.047847 -0.54001 0.52457 -0.70963 -0.32528 -0.1346 -0.41314 0.33435 -0.0072412 0.32253 -0.044219 -1.2969 0.76217 0.46349 +also -0.33819 0.064568 -0.032558 -0.29448 0.84125 -0.29092 -0.35264 0.35777 0.004152 -0.0067549 -0.11512 -0.38832 0.49764 0.47187 0.046247 -0.059806 0.59317 -0.080286 -0.45926 0.28211 0.33909 -0.25741 0.30599 0.53594 0.1168 -0.30916 -0.16143 -0.1841 -0.26339 -0.035592 -0.13136 1.1538 -0.61616 0.73314 0.46168 0.4241 0.2918 0.73092 -0.17098 -0.03529 -0.6867 -0.24653 0.34776 -0.46747 0.21257 -0.052958 0.1032 -0.52122 0.61087 -0.71005 -0.16765 -0.34415 0.27119 1.1337 -0.33195 -2.3864 -0.52352 -0.25531 0.80993 1.3563 -0.1452 0.32792 0.11149 0.17806 1.0008 -0.37243 0.3127 0.28634 0.47915 -0.23534 0.13146 -0.5478 0.054173 -0.19163 0.16276 -0.067267 -0.0044537 0.55708 -1.2568 -0.063385 0.62438 -0.28284 -0.6458 -0.2832 -1.8987 -0.5706 0.026083 -0.41721 0.29686 -0.18416 -0.19252 -0.59915 -0.17981 0.17649 -0.56043 0.48284 -0.44081 -0.84036 0.78533 0.36017 +text -0.49705 0.71642 0.40119 -0.05761 0.83614 0.8256 0.08963 -0.53492 0.34335 -0.27079 -0.011152 0.025207 -0.1235 0.11801 0.045312 0.73144 0.13744 -0.13084 -0.028249 -0.30789 -0.81864 -0.54517 0.25151 0.53891 0.38293 -1.0343 -0.1104 0.44977 -0.13019 0.24847 0.1048 0.19567 -0.42672 -0.37912 0.14535 -0.025532 -0.23523 -0.3638 -0.14269 0.0062072 -0.63 -0.23068 0.086461 0.22126 -0.65625 -0.55701 -0.60243 -0.13159 -0.027226 0.0044152 1.4123 1.3042 0.54118 0.33443 -0.51865 -1.8253 -0.30525 -0.32747 1.236 0.08771 0.007793 0.36571 -0.39304 -0.79174 0.57874 -0.0025427 0.10442 0.64166 -0.1881 -0.76203 0.23008 0.30637 1.0386 -0.69846 0.31094 0.63762 -0.09997 0.16999 -0.59984 -0.89565 -0.25059 -0.93011 -0.59606 -0.32965 -1.6828 0.39102 0.65383 -1.5176 0.61748 0.0075596 0.040066 0.60803 -0.027058 0.15273 -0.16887 -0.47664 -0.61775 -0.98735 0.23776 0.39952 From e798f5c2757bc2c591806779eec3b00a1e668663 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 20:16:01 +0100 Subject: [PATCH 5/7] added documentation --- load_corpus.py | 5 +++-- superstyl/load.py | 27 ++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/load_corpus.py b/load_corpus.py index 3ebfc275..56dd309b 100755 --- a/load_corpus.py +++ b/load_corpus.py @@ -19,7 +19,8 @@ parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int) parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int) parser.add_argument('--absolute_freqs', action='store_true', help="switch to 
get absolute instead of relative freqs", default=False)
-    parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt",
+    parser.add_argument('-x', action='store', help="format (txt, xml or tei) /!\ only txt is fully implemented",
+                        default="txt",
                         choices=["txt", "xml", "tei"]
                         )
     parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
@@ -31,7 +32,7 @@
     parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
     parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
                         default=None, type=int)
-    parser.add_argument('--keep_punct', action='store_true', help="whether or not to keep punctuation and caps (default is False)",
+    parser.add_argument('--keep_punct', action='store_true', help="whether to keep punctuation and caps (default is False)",
                         default=False)
     parser.add_argument('--keep_sym', action='store_true',
                         help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)",
diff --git a/superstyl/load.py b/superstyl/load.py
index eec1afdb..83101ffe 100644
--- a/superstyl/load.py
+++ b/superstyl/load.py
@@ -10,7 +10,32 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
                 identify_lang=False, embedding=False, neighbouring_size=10):
     """
     Main function to load a corpus from a collection of file, and an optional list of features to extract.
-    :param #TODO, document all params
+    :param data_paths: paths to the source files
+    :param feat_list: an optional list of features (as created by load_corpus), default None
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', and 'POS'. Affixes are inspired by
+    Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_punct, punctuation n-grams.
+    POS are currently only implemented for Modern English
+    :param n: n-gram lengths (default 1)
+    :param k: how many of the most frequent features to keep. The function takes the frequency of the feature at rank k
+    (if k is smaller than the total number of features), and keeps only the features whose total frequency is greater
+    than or equal to it.
+    :param relFreqs: return relative frequencies (default: True)
+    :param format: one of txt, xml or tei. /!\ only txt is fully implemented.
+    :param sampling: whether to sample the texts, by cutting them into slices of a given length, up to the last possible
+    slice of this length, which means that the end of a text will often be discarded (default False)
+    :param units: units of length for sampling, one of 'words', 'verses' (default: words). 'verses' is only implemented
+    for the 'tei' format
+    :param size: the size of the samples (in units)
+    :param step: step for sampling with overlap (default is step = size, which means no overlap).
+    Reduce for overlapping slices
+    :param max_samples: Maximum number of (randomly selected) samples per author/class (default is all)
+    :param keep_punct: whether to keep punctuation and caps (default is False)
+    :param keep_sym: same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False).
+    /!\ does not actually keep symbols
+    :param identify_lang: if true, the language of each text will be guessed, using langdetect (default is False)
+    :param embedding: optional path to a word2vec embedding in txt format, to compute frequencies among a set of
+    semantic neighbours (i.e., pseudo-paronyms)
+    :param neighbouring_size: size of the semantic neighbourhood in the embedding (as per gensim most_similar,
+    with topn=neighbouring_size)
     :return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their
     frequencies
     """

From 6e5702348be9597e36a32cf40b2bb4416ff91e51 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps
Date: Thu, 22 Feb 2024 20:18:18 +0100
Subject: [PATCH 6/7] small doc change

---
 load_corpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/load_corpus.py b/load_corpus.py
index 56dd309b..cfe51749 100755
--- a/load_corpus.py
+++ b/load_corpus.py
@@ -9,7 +9,7 @@
 
     import argparse
     parser = argparse.ArgumentParser()
-    parser.add_argument('-s', nargs='+', help="paths to files")
+    parser.add_argument('-s', nargs='+', help="paths to files", required=True)
     parser.add_argument('-o', action='store', help="optional base name of output files", type=str, default=False)
     parser.add_argument('-f', action="store", help="optional list of features in json", default=False)
     parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "

From 8fb83710314a7073c0973cf10deeb8666be99acb Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps
Date: Thu, 22 Feb 2024 20:24:53 +0100
Subject: [PATCH 7/7] fixed README and now DONE

---
 README.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/README.md b/README.md
index 84e3edf0..82486fa3 100755
--- a/README.md
+++ b/README.md
@@ -140,15 +140,6 @@ python load_corpus.py -s data/psyche/train/* -t chars -n 3 -x tei --sampling --s
 You have a lot of options for feats extraction, inclusion or not of punctuation and symbols, sampling, source file
 formats, …, that can be accessed through the help.
 
-### Optional: Filter features
-
-You can filter certain features (for instance retain only 'pseudo-affixes' from character n-grams) using the command
-
-```bash
-python features_filter.py -f feature_list.json --affixes_grams --punct_grams
-```
-
-
 ### Optional: Merge different features
 
 You can merge several sets of features, extracted in csv with the previous commands, by doing:
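
To make the new `affixes` feature type concrete, here is a minimal, self-contained sketch of what the branch added in PATCH 1 (and fixed in PATCH 2) computes. This is an illustration, not superstyl's API: `affix_counts` is a made-up name, and the word-affix slices are written with `n` where the diffs above hardcode 3-character slices (the two coincide for the n=3 case the tests exercise).

```python
from collections import Counter

import nltk
import nltk.tokenize
import regex as re

def affix_counts(text, n=3):
    """Affix n-grams, after Sapkota et al., NAACL 2015 (illustrative sketch)."""
    words = nltk.tokenize.wordpunct_tokenize(text)
    # Word-affix types: the first and last n characters of each word longer than n.
    affs = [w[:n] for w in words if len(w) > n] + [w[-n:] for w in words if len(w) > n]
    # Space and punctuation types: character n-grams that start or end with a
    # space, or contain punctuation; spaces are rendered as underscores.
    ngrams = [''.join(g) for g in nltk.ngrams(text, n)]
    space_and_punct = [re.sub(r'\p{Z}', '_', g) for g in ngrams
                       if re.search(r'(^\p{Z})|(\p{Z}$)|(\p{P})', g)]
    # The total counts every character n-gram, so that relative frequencies are
    # computed over the full n-gram population, as in PATCH 2.
    return Counter(affs + space_and_punct), len(ngrams)

print(affix_counts("These yo yo!", n=3))
# (Counter({'_yo': 2, 'The': 1, 'ese': 1, 'se_': 1, 'yo_': 1, 'yo!': 1}), 10)
```

The printed result matches the expectation added to `test_counts` in PATCH 2.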
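Likewise, the embedding logic tested in PATCH 3 boils down to replacing a word's relative frequency with its frequency within a small semantic neighbourhood. A rough sketch under the same caveats — `neighbourhood_relative_freq` is a hypothetical helper, not part of superstyl, and it assumes a gensim `KeyedVectors` model such as the test embedding added in PATCH 4:

```python
import gensim.models

def neighbourhood_relative_freq(word_counts, word, wv, topn=10):
    """Frequency of `word` relative to itself plus its topn nearest neighbours."""
    if word not in wv.key_to_index or word not in word_counts:
        return None  # word missing from the embedding or from the text
    neighbours = [w for w, _ in wv.most_similar(word, topn=topn)]
    # Sum the counts of the word and of those neighbours that occur in the text.
    total = sum(word_counts.get(w, 0) for w in [word] + neighbours)
    return word_counts[word] / total

# For instance, with the test embedding (where "the" is the nearest neighbour
# of "this") and the word counts of Smith_Letter1:
# wv = gensim.models.KeyedVectors.load_word2vec_format("tests/embed/test_embedding.wv.txt")
# neighbourhood_relative_freq({'this': 1, 'is': 1, 'the': 1, 'text': 1}, "this", wv, topn=1)
# -> 0.5, as expected in test_get_embedded_counts
```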