include affixes in standard load_corpus #54

Merged (7 commits) on Feb 26, 2024
9 changes: 0 additions & 9 deletions README.md
@@ -140,15 +140,6 @@ python load_corpus.py -s data/psyche/train/* -t chars -n 3 -x tei --sampling --s

There are many options for feature extraction (inclusion or exclusion of punctuation and symbols, sampling, source file formats, …) that can be accessed through the help.

### Optional: Filter features

You can filter certain features (for instance retain only 'pseudo-affixes' from character n-grams) using the command

```bash
python features_filter.py -f feature_list.json --affixes_grams --punct_grams
```


### Optional: Merge different features

You can merge several sets of features, extracted in csv with the previous commands, by doing:
36 changes: 0 additions & 36 deletions features_filter.py

This file was deleted.

19 changes: 14 additions & 5 deletions load_corpus.py
@@ -9,21 +9,30 @@
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-s', nargs='+', help="paths to files")
parser.add_argument('-s', nargs='+', help="paths to files", required=True)
parser.add_argument('-o', action='store', help="optional base name of output files", type=str, default=False)
parser.add_argument('-f', action="store", help="optional list of features in json", default=False)
parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str, default="words")
parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "
"as per Sapkota et al. 2015 - or POS). POS are currently"
"only implemented for Modern English", type=str,
default="words", choices=["words", "chars", "affixes", "POS"])
parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt")
parser.add_argument('-x', action='store', help="format (txt, xml or tei) /!\ only txt is fully implemented",
default="txt",
choices=["txt", "xml", "tei"]
)
parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
parser.add_argument('--sample_units', action='store', help="Units of length for sampling (words, verses; default: words)", default="words", type=str)
parser.add_argument('--sample_units', action='store', help="Units of length for sampling "
"(words, verses; default: words)",
choices=["words", "verses"],
default="words", type=str)
parser.add_argument('--sample_size', action='store', help="Size for sampling (default: 3000)", default=3000, type=int)
parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
default=None, type=int)
parser.add_argument('--keep_punct', action='store_true', help="whether or not to keep punctuation and caps (default is False)",
parser.add_argument('--keep_punct', action='store_true', help="whether to keep punctuation and caps (default is False)",
default=False)
parser.add_argument('--keep_sym', action='store_true',
help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)",
31 changes: 28 additions & 3 deletions superstyl/load.py
@@ -2,7 +2,6 @@
import superstyl.preproc.features_extract as fex
from superstyl.preproc.text_count import count_process
import superstyl.preproc.embedding as embed
import json
import tqdm
import pandas

@@ -11,7 +10,32 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
identify_lang=False, embedding=False, neighbouring_size=10):
"""
Main function to load a corpus from a collection of files, and an optional list of features to extract.
:param #TODO, document all params
:param data_paths: paths to the source files
:param feat_list: an optional list of features (as created by load_corpus), default None
:param feats: the type of features, one of 'words', 'chars', 'affixes', and 'POS'. Affixes are inspired by
Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_punct, punctuation n-grams.
POS are currently only implemented for Modern English
:param n: the length of n-grams (default 1)
:param k: how many of the most frequent features to keep. The function takes the frequency of the feature at rank k
(if k is smaller than the total number of features) and keeps only features with total frequency greater than or equal to it.
:param relFreqs: return relative frequencies (default: True)
:param format: one of txt, xml or tei. /!\ only txt is fully implemented.
:param sampling: whether to sample the texts, by cutting them into slices of a given length, up to the last possible
slice of that length, which means that the end of a text will often be discarded (default False)
:param units: units of length for sampling, one of 'words', 'verses' (default: words). 'verses' is only implemented
for the 'tei' format
:param size: the size of the samples (in units)
:param step: step for sampling with overlap (default is step = size, which means no overlap).
Reduce for overlapping slices
:param max_samples: Maximum number of (randomly selected) samples per author/class (default is all)
:param keep_punct: whether to keep punctuation and caps (default is False)
:param keep_sym: same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False). /!\ does not
actually keep symbols
:param identify_lang: if true, the language of each text will be guessed, using langdetect (default is False)
:param embedding: optional path to a word2vec embedding in txt format to compute frequencies among a set of
semantic neighbours (i.e., pseudo-paronyms)
:param neighbouring_size: size of semantic neighbouring in the embedding (as per gensim most_similar,
with topn=neighbouring_size)
:return: a pandas DataFrame of text metadata and feature frequencies; a global list of features with their frequencies
"""

@@ -53,7 +77,8 @@

if embedding:
print(".......embedding counts.......")
myTexts = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size)
myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size)
feat_list = [f for f in feat_list if f[0] in my_feats]

unique_texts = [text["name"] for text in myTexts]

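For context, here is a minimal usage sketch of the updated loader. The file paths and argument values are hypothetical, chosen to illustrate the new feature type; the two return values follow the docstring above.

```python
# Hypothetical call to the updated load_corpus, using the new "affixes" type.
from superstyl.load import load_corpus

corpus, feat_list = load_corpus(
    ["data/author1/text1.txt", "data/author2/text2.txt"],  # hypothetical paths
    feats="affixes",  # character n-gram classes from Sapkota et al. 2015
    n=3,              # affixes are character n-grams, so n is their length
    k=5000,           # keep features at least as frequent as the k-th most frequent
    relFreqs=True,    # relative frequencies, computed over all n-grams
)
# corpus: pandas DataFrame of text metadata and feature frequencies
# feat_list: global (feature, frequency) pairs, most frequent first
```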
24 changes: 9 additions & 15 deletions superstyl/preproc/embedding.py
@@ -1,5 +1,3 @@
import numpy as np
from scipy import spatial
import gensim.models

def load_embeddings(path):
@@ -25,12 +23,7 @@ def find_similar_words(model, word, topn=10):
else:
return [s[0] for s in model.most_similar(word, topn=topn)]

# For tests
# myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', 'wordCounts': {'the': 1, 'this': 1}},
# {'name': 'Letter2', 'aut': 'Smith', 'text': 'Also the text', 'lang': 'en',
# 'wordCounts': {'the': 1, 'also': 1}}]
# feat_list = ['the']
# feat = "the"

def get_embedded_counts(myTexts, feat_list, model, topn=10):
"""
Replace absolute frequencies by frequencies relative to a given semantic neighbourhood
@@ -40,7 +33,14 @@ def get_embedded_counts(myTexts, feat_list, model, topn=10):
:param model: the embeddings model
:param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
:return: the myTexts collection with, for each text, a 'wordCounts' dictionary with said semantic relative frequencies
as well as the new feat_list with only the features that were actually used
"""
# First, create the new key
for i in enumerate(myTexts):
myTexts[i[0]]["embedded"] = {}

# keep only features present in the embedding
feat_list = [f for f in feat_list if f in list(model.index_to_key)]

for feat in feat_list:
similars = find_similar_words(model, feat, topn=topn)
@@ -50,17 +50,11 @@

else:
for i in enumerate(myTexts):

if feat in myTexts[i[0]]["wordCounts"].keys():

if "embedded" not in myTexts[i[0]].keys():
# then, initialise
myTexts[i[0]]["embedded"] = {}

total = sum([myTexts[i[0]]["wordCounts"][s] for s in [feat]+similars if s in myTexts[i[0]]["wordCounts"].keys()])
myTexts[i[0]]["embedded"][feat] = myTexts[i[0]]["wordCounts"][feat] / total

return myTexts
return myTexts, feat_list



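The arithmetic in get_embedded_counts is easiest to see on toy data. This sketch substitutes a hand-picked neighbour list for a real gensim model, but follows the same formula as the loop above:

```python
# Toy illustration (not the repo's API): a feature's count is divided by the
# summed counts of the feature plus its topn neighbours present in the text.
word_counts = {"the": 10, "this": 4, "that": 6}
similars = ["this", "that"]  # stand-in for find_similar_words(model, "the", topn=2)

total = sum(word_counts[s] for s in ["the"] + similars if s in word_counts)
embedded = word_counts["the"] / total  # 10 / (10 + 4 + 6) = 0.5
```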
55 changes: 36 additions & 19 deletions superstyl/preproc/features_extract.py
@@ -2,27 +2,41 @@
from collections import Counter
import nltk.tokenize
import nltk
import regex as re


def count_words(text, feats = "words", n = 1):
def count_features(text, feats ="words", n = 1):
"""
Get feature counts from a text (words, chars or POS n-grams)
Get feature counts from a text (words, chars or POS n-grams, or affixes (+punct if keep_punct),
following Sapkota et al., NAACL 2015)
:param text: the source text
:param feats: the type of feats: words, chars, POS (supported only for English)
:param feats: the type of feats: words, chars, POS (supported only for English), or affixes
:param n: the length of n-grams
:return: features absolute frequencies in text as a counter
:return: features absolute frequencies in text as a counter, and the total number of tokens (used to compute relative frequencies)
"""
# Should this be called count_words ? It counts other features as well... count_features ? It's just a grep and replace away.

if feats == "words":
tokens = nltk.tokenize.wordpunct_tokenize(text)
if n > 1:
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
total = len(tokens)

elif feats == "chars":
tokens = list(text.replace(' ', '_'))
if n > 1:
tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]
tokens = [re.sub(r'\p{Z}', '_', ''.join(ngram)) for ngram in nltk.ngrams(text, n)]
total = len(tokens)

elif feats == "affixes":
words = nltk.tokenize.wordpunct_tokenize(text)
ngrams = [''.join(ngram) for ngram in nltk.ngrams(text, n)]
# relative frequencies should be computed from all existing n-grams
total = len(ngrams)
# and now get all types from Sapkota et al.
affs = [w[:3] for w in words if len(w) > n] + [w[-3:] for w in words if len(w) > n]
# space affixes (and punct affixes if keep_punct has been enabled)
space_affs_and_punct = [re.sub(r'\p{Z}', '_', ngram)
for ngram in ngrams
if re.search(r'(^\p{Z})|(\p{Z}$)|(\p{P})', ngram)
]
tokens = affs + space_affs_and_punct

#POS in english with NLTK - need to propose spacy later on
elif feats == "pos":
@@ -32,6 +46,7 @@
tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]
else:
tokens = pos_tags
total = len(tokens)

# Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better
#elif feats == "sentenceLength":
@@ -40,21 +55,21 @@

#Adding an error message in case some distracted guy like me would enter something wrong:
else:
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes' or 'pos'.")

counts = Counter()
counts.update(tokens)

return counts
return counts, total

def relative_frequencies(wordCounts):
def relative_frequencies(wordCounts, total):
"""
For a counter of word counts, return the relative frequencies
:param wordCounts: a dictionary of word counts
:param total: the total number of features
:return a counter of word relative frequencies
"""

total = sum(wordCounts.values())
for t in wordCounts.keys():
wordCounts[t] = wordCounts[t] / total
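
Since affix tokens are only a subset of all character n-grams, the denominator can no longer be recovered from the counter itself, which is why the total is now passed in. A small sketch of the new contract, with made-up counts:

```python
# Sketch: the caller supplies the denominator, so counts restricted to a
# subset of features (e.g. affixes) are still normalised by all n-grams seen.
from collections import Counter

counts = Counter({"at.": 2, "_ca": 3})  # hypothetical affix counts
total = 10                              # all character n-grams in the text
rel = {t: c / total for t, c in counts.items()}  # {'at.': 0.2, '_ca': 0.3}
```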

@@ -65,19 +80,21 @@
"""
:param myTexts: a 'myTexts' object, containing documents to be processed
:param feat_list: a list of features to be selected
:param feats: type of feats (words, chars, POS)
:param feats: type of feats (words, chars, affixes or POS)
:param n: n-grams length
:return: list of features, with total frequency
"""
my_feats = Counter()
total = 0

for text in myTexts:
counts = count_words(text["text"], feats=feats, n=n)
counts, text_total = count_features(text["text"], feats=feats, n=n)

my_feats.update(counts)
total = total + text_total

if relFreqs:
my_feats = relative_frequencies(my_feats)
my_feats = relative_frequencies(my_feats, total)

# sort them
my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]
@@ -90,18 +107,18 @@
Get counts for a collection of texts
:param myTexts: the document collection
:param feat_list: a list of features to be selected (None for all)
:param feats: the type of feats (words, chars, etc.)
:param feats: the type of feats (words, chars, affixes, POS)
:param n: the length of n-grams
:param relFreqs: whether to compute relative freqs
:return: the collection with, for each text, a 'wordCounts' dictionary
"""

for i in enumerate(myTexts):

counts = count_words(myTexts[i[0]]["text"], feats=feats, n=n)
counts, total = count_features(myTexts[i[0]]["text"], feats=feats, n=n)

if relFreqs:
counts = relative_frequencies(counts)
counts = relative_frequencies(counts, total)

if feat_list:
# and keep only the ones in the feature list
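To make the affixes branch of count_features concrete, here is a tiny walk-through of the same logic on a toy string (mirroring the merged code, not calling it):

```python
# Toy walk-through of the affixes extraction, with n=3.
import regex as re
import nltk

text = "a cat."
n = 3
words = nltk.tokenize.wordpunct_tokenize(text)       # ['a', 'cat', '.']
ngrams = [''.join(g) for g in nltk.ngrams(text, n)]  # ['a c', ' ca', 'cat', 'at.']

# prefixes/suffixes of words longer than n
affs = [w[:3] for w in words if len(w) > n] + [w[-3:] for w in words if len(w) > n]
# n-grams that start or end with a space, or contain punctuation
space_affs_and_punct = [re.sub(r'\p{Z}', '_', g)
                        for g in ngrams
                        if re.search(r'(^\p{Z})|(\p{Z}$)|(\p{P})', g)]

tokens = affs + space_affs_and_punct  # [] + ['_ca', 'at.']
total = len(ngrams)                   # 4: relative freqs use all n-grams
```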
36 changes: 0 additions & 36 deletions superstyl/preproc/features_select.py

This file was deleted.
