From 7f11fcd252152227270c108e2dc5d9856893d9cc Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 16:25:40 +0100 Subject: [PATCH 1/7] include affixes in standard load_corpus --- features_filter.py | 36 ------------------------- load_corpus.py | 14 +++++++--- superstyl/preproc/features_extract.py | 38 +++++++++++++++++---------- superstyl/preproc/features_select.py | 36 ------------------------- tests/test_main.py | 6 ++--- 5 files changed, 38 insertions(+), 92 deletions(-) delete mode 100755 features_filter.py delete mode 100755 superstyl/preproc/features_select.py diff --git a/features_filter.py b/features_filter.py deleted file mode 100755 index 67580f87..00000000 --- a/features_filter.py +++ /dev/null @@ -1,36 +0,0 @@ -import superstyl.preproc.features_select as fs -import json -import regex as re - -#TODO: implement more types from Sapkota et al? - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('-f', action="store", help="list of features in json (such as produced by main.py)", required=True) - parser.add_argument('--affixes_grams', action='store_true', help="Keep affixes (space starting or ending n-grams)", default=False) - parser.add_argument('--punct_grams', action='store_true', help="Keep n-grams containing punctuation", default=False) - #parser.add_argument('--word-grams', action='store_true', help="Keep n-grams with word content", default=False) - args = parser.parse_args() - - print(".......loading preexisting feature list.......") - with open(args.f, 'r') as f: - my_feats = json.loads(f.read()) - - print(".......Filtering feature list.......") - my_feats = fs.filter_ngrams(my_feats, affixes=args.affixes_grams, punct=args.punct_grams) - - # name the output - outfile = re.sub(r"\.json$", "", args.f) - if args.affixes_grams: - outfile = outfile+"_affixes" - - if args.punct_grams: - outfile = outfile + "_punct" - - outfile = outfile+".json" - - print(".......Writing .......") - with open(outfile, "w") as out: - out.write(json.dumps(my_feats)) diff --git a/load_corpus.py b/load_corpus.py index f4638556..3ebfc275 100755 --- a/load_corpus.py +++ b/load_corpus.py @@ -12,13 +12,21 @@ parser.add_argument('-s', nargs='+', help="paths to files") parser.add_argument('-o', action='store', help="optional base name of output files", type=str, default=False) parser.add_argument('-f', action="store", help="optional list of features in json", default=False) - parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str, default="words") + parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - " + "as per Sapkota et al. 2015 - or POS). 
POS are currently "
+                                                   "only implemented for Modern English", type=str,
+                        default="words", choices=["words", "chars", "affixes", "POS"])
     parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
     parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
     parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
-    parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt")
+    parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt",
+                        choices=["txt", "xml", "tei"]
+                        )
     parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
-    parser.add_argument('--sample_units', action='store', help="Units of length for sampling (words, verses; default: words)", default="words", type=str)
+    parser.add_argument('--sample_units', action='store', help="Units of length for sampling "
+                                                               "(words, verses; default: words)",
+                        choices=["words", "verses"],
+                        default="words", type=str)
     parser.add_argument('--sample_size', action='store', help="Size for sampling (default: 3000)", default=3000, type=int)
     parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
     parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index 0b4ddee1..2d3f7989 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -2,27 +2,37 @@
 from collections import Counter
 import nltk.tokenize
 import nltk
+import regex as re
 
-
-def count_words(text, feats = "words", n = 1):
+def count_features(text, feats ="words", n = 1):
     """
-    Get feature counts from a text (words, chars or POS n-grams)
+    Get feature counts from a text (words, chars or POS n-grams, or affixes (+ punct if keep_punct)),
+    following Sapkota et al., NAACL 2015
     :param text: the source text
-    :param feats: the type of feats: words, chars, POS (supported only for English)
+    :param feats: the type of feats: words, chars, POS (supported only for English), or affixes
     :param n: the length of n-grams
     :return: features absolute frequencies in text as a counter
     """
-    # Should this be called count_words ? It counts other features as well... count_features ? It's just a grep and replace away.
-    if feats == "words":
+    if feats == "words" or feats == "affixes":
         tokens = nltk.tokenize.wordpunct_tokenize(text)
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
 
     elif feats == "chars":
-        tokens = list(text.replace(' ', '_'))
-        if n > 1:
-            tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]
+        tokens = [re.sub(r'\p{Z}', '_', ''.join(ngram)) for ngram in nltk.ngrams(text, n)]
+
+    elif feats == "affixes":
+        words = nltk.tokenize.wordpunct_tokenize(text)
+        ngrams = [''.join(ngram) for ngram in nltk.ngrams(text, n)]
+        # and now get all types from Sapkota et al.
+ affs = [w[:3] for w in words if len(w) > n] + [w[-3:] for w in words if len(w) > n] + # space affixes (and punct affixes if keep_punct has been enabled) + space_affs_and_punct = [re.sub(r'\p{Z}', '_', ngram) + for ngram in ngrams + if re.search(r'(^\p{Z})|(\p{Z}$)|(\p{P})', ngram) + ] + tokens = affs + space_affs_and_punct #POS in english with NLTK - need to propose spacy later on elif feats == "pos": @@ -40,7 +50,7 @@ def count_words(text, feats = "words", n = 1): #Adding an error message in case some distracted guy like me would enter something wrong: else: - raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.") + raise ValueError("Unsupported feature type. Choose from 'words', 'chars', 'affixes' or 'pos'.") counts = Counter() counts.update(tokens) @@ -65,14 +75,14 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True): """ :param myTexts: a 'myTexts' object, containing documents to be processed :param feat_list: a list of features to be selected - :param feats: type of feats (words, chars, POS) + :param feats: type of feats (words, chars, affixes or POS) :param n: n-grams length :return: list of features, with total frequency """ my_feats = Counter() for text in myTexts: - counts = count_words(text["text"], feats=feats, n=n) + counts = count_features(text["text"], feats=feats, n=n) my_feats.update(counts) @@ -90,7 +100,7 @@ def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False Get counts for a collection of texts :param myTexts: the document collection :param feat_list: a list of features to be selected (None for all) - :param feats: the type of feats (words, chars, etc.) + :param feats: the type of feats (words, chars, affixes, POS) :param n: the length of n-grams :param relFreqs: whether to compute relative freqs :return: the collection with, for each text, a 'wordCounts' dictionary @@ -98,7 +108,7 @@ def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False for i in enumerate(myTexts): - counts = count_words(myTexts[i[0]]["text"], feats=feats, n=n) + counts = count_features(myTexts[i[0]]["text"], feats=feats, n=n) if relFreqs: counts = relative_frequencies(counts) diff --git a/superstyl/preproc/features_select.py b/superstyl/preproc/features_select.py deleted file mode 100755 index 3d1e9418..00000000 --- a/superstyl/preproc/features_select.py +++ /dev/null @@ -1,36 +0,0 @@ -import regex as re - -def filter_ngrams(feat_list, affixes=True, punct=True): - """ - Filter a list of features in input to yield a selection of n-grams, according to the parameters, - following Sapkota et al., NAACL 2015 - feat_list: the feature list (typically, coming of load_corpus.py and loaded) - affixes: affixes (n-grams beginning or ending by space) - punct: n-grams containing punctuation - """ - - out = [] - - if affixes: - out = out + [f for f in feat_list if f[0].startswith('_') or f[0].endswith('_')] - switch = True - seen = set([f[0] for f in out]) - - if punct: - # a bit trickier: need to remove underscore not to include n-grams with just - # underscore as punctuation - if switch: - out = out + [f for f in feat_list if re.match(r"\p{P}", re.sub('_', '', f[0])) - and f[0] not in seen] - - else: - out = out + [f for f in feat_list if re.match(r"\p{P}", re.sub('_', '', f[0]))] - - return out - - - - - - - diff --git a/tests/test_main.py b/tests/test_main.py index 3b1bcb4b..b18b5f40 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -391,13 +391,13 @@ def test_counts(self): # GIVEN text = "the cat the 
dog the squirrel the cat the cat" # WHEN - results = superstyl.preproc.features_extract.count_words(text, feats = "words", n = 1) + results = superstyl.preproc.features_extract.count_features(text, feats ="words", n = 1) # THEN expected = {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1} self.assertEqual(results, expected) # WHEN - results = superstyl.preproc.features_extract.count_words(text, feats="words", n=2) + results = superstyl.preproc.features_extract.count_features(text, feats="words", n=2) # THEN expected = {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1} self.assertEqual(results, expected) @@ -405,7 +405,7 @@ def test_counts(self): # GIVEN text = "the yo yo" # WHEN - results = superstyl.preproc.features_extract.count_words(text, feats="chars", n=3) + results = superstyl.preproc.features_extract.count_features(text, feats="chars", n=3) # THEN expected = {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1} self.assertEqual(results, expected) From effce56aa36b5828cdd6585aacf58b34d592b524 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 17:40:32 +0100 Subject: [PATCH 2/7] fixed the function and added tests --- superstyl/preproc/features_extract.py | 25 +++-- tests/{test_main.py => test_load_corpus.py} | 113 ++++++++++++++++++-- 2 files changed, 123 insertions(+), 15 deletions(-) rename tests/{test_main.py => test_load_corpus.py} (72%) diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py index 2d3f7989..874cd7eb 100755 --- a/superstyl/preproc/features_extract.py +++ b/superstyl/preproc/features_extract.py @@ -11,20 +11,24 @@ def count_features(text, feats ="words", n = 1): :param text: the source text :param feats: the type of feats: words, chars, POS (supported only for English), or affixes :param n: the length of n-grams - :return: features absolute frequencies in text as a counter + :return: features absolute frequencies in text as a counter, and the total of frequencies """ - if feats == "words" or feats == "affixes": + if feats == "words": tokens = nltk.tokenize.wordpunct_tokenize(text) if n > 1: tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))] + total = len(tokens) elif feats == "chars": tokens = [re.sub(r'\p{Z}', '_', ''.join(ngram)) for ngram in nltk.ngrams(text, n)] + total = len(tokens) elif feats == "affixes": words = nltk.tokenize.wordpunct_tokenize(text) ngrams = [''.join(ngram) for ngram in nltk.ngrams(text, n)] + # relative frequencies should be computed from all existing n-grams + total = len(ngrams) # and now get all types from Sapkota et al. 
affs = [w[:3] for w in words if len(w) > n] + [w[-3:] for w in words if len(w) > n] # space affixes (and punct affixes if keep_punct has been enabled) @@ -42,6 +46,7 @@ def count_features(text, feats ="words", n = 1): tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))] else: tokens = pos_tags + total = len(tokens) # Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better #elif feats == "sentenceLength": @@ -55,16 +60,16 @@ def count_features(text, feats ="words", n = 1): counts = Counter() counts.update(tokens) - return counts + return counts, total -def relative_frequencies(wordCounts): +def relative_frequencies(wordCounts, total): """ For a counter of word counts, return the relative frequencies :param wordCounts: a dictionary of word counts + :param total, the total number of features :return a counter of word relative frequencies """ - total = sum(wordCounts.values()) for t in wordCounts.keys(): wordCounts[t] = wordCounts[t] / total @@ -80,14 +85,16 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True): :return: list of features, with total frequency """ my_feats = Counter() + total = 0 for text in myTexts: - counts = count_features(text["text"], feats=feats, n=n) + counts, text_total = count_features(text["text"], feats=feats, n=n) my_feats.update(counts) + total = total + text_total if relFreqs: - my_feats = relative_frequencies(my_feats) + my_feats = relative_frequencies(my_feats, total) # sort them my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)] @@ -108,10 +115,10 @@ def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False for i in enumerate(myTexts): - counts = count_features(myTexts[i[0]]["text"], feats=feats, n=n) + counts, total = count_features(myTexts[i[0]]["text"], feats=feats, n=n) if relFreqs: - counts = relative_frequencies(counts) + counts = relative_frequencies(counts, total) if feat_list: # and keep only the ones in the feature list diff --git a/tests/test_main.py b/tests/test_load_corpus.py similarity index 72% rename from tests/test_main.py rename to tests/test_load_corpus.py index b18b5f40..faa3c651 100644 --- a/tests/test_main.py +++ b/tests/test_load_corpus.py @@ -6,7 +6,6 @@ import superstyl.preproc.embedding import superstyl.preproc.select import superstyl.preproc.text_count -import superstyl.preproc.features_select import os import glob @@ -119,9 +118,103 @@ def test_load_corpus(self): expected_feats = [('this', 2 / 12), ('is', 2 / 12), ('the', 2 / 12), ('text', 2 / 12)] self.assertEqual(feats, expected_feats) + # WHEN + corpus, feats = superstyl.load.load_corpus(self.paths, feats="chars", n=3, format="txt", keep_punct=True, + relFreqs=False) + + # THEN + expected_feats = [('e_t', 3), ('_te', 3), ('tex', 3), ('ext', 3), ('is_', 3), ('Thi', 2), ('his', 2), ('s_i', 2), + ('_is', 2), ('_th', 2), ('the', 2), ('he_', 2), ('xt!', 2), ('Voi', 1), ('oic', 1), ('ici', 1), + ('ci_', 1), ('i_l', 1), ('_le', 1), ('le_', 1), ('xte', 1), ('te!', 1), ('is,', 1), ('s,_', 1), + (',_a', 1), ('_al', 1), ('als', 1), ('lso', 1), ('so_', 1), ('o_,', 1), ('_,_', 1), (',_t', 1), + ('s_t', 1)] + + + expected_corpus = {'author': + {'Dupont_Letter1.txt': 'Dupont', 'Smith_Letter2.txt': 'Smith', 'Smith_Letter1.txt': 'Smith'}, + 'lang': {'Dupont_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA', 'Smith_Letter1.txt': 'NA'}, + 'e_t': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + '_te': {'Dupont_Letter1.txt': 1, 
'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'tex': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'ext': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'is_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 2}, + 'Thi': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'his': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 's_i': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + '_is': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + '_th': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'the': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'he_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'xt!': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 1}, + 'Voi': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'oic': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'ici': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'ci_': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'i_l': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + '_le': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'le_': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'xte': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'te!': {'Dupont_Letter1.txt': 1, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'is,': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 's,_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + ',_a': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + '_al': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 'als': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 'lso': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 'so_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 'o_,': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + '_,_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + ',_t': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1, 'Smith_Letter1.txt': 0}, + 's_t': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 1}} + + + self.assertEqual(sorted(feats), sorted(expected_feats)) + self.assertEqual(corpus.to_dict(), expected_corpus) + # WHEN + corpus, feats = superstyl.load.load_corpus(self.paths, feats="affixes", n=3, format="txt", keep_punct=True) - # TODO: test other options + # THEN + expected_feats = [('_te', 3/51), ('tex', 3/51), ('ext', 2/51), ('is_', 3/51), ('Thi', 2/51), ('his', 2/51), + ('_is', 2/51), ('_th', 2/51), ('he_', 2/51), ('xt!', 2/51), ('Voi', 1/51),('ici', 1/51), + ('ci_', 1/51), ('_le', 1/51), ('le_', 1/51), ('xte', 1/51), ('te!', 1/51), ('is,', 1/51), + ('s,_', 1/51), (',_a', 1/51), ('_al', 1/51), ('als', 1/51), ('lso', 1/51), ('so_', 1/51), + ('o_,', 1/51), ('_,_', 1/51), (',_t', 1/51)] + + expected_corpus = {'author': + {'Dupont_Letter1.txt': 'Dupont', 'Smith_Letter2.txt': 'Smith', + 'Smith_Letter1.txt': 'Smith'}, + 'lang': {'Dupont_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA', 
'Smith_Letter1.txt': 'NA'}, + '_te': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'tex': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'ext': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'is_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 2/15}, + 'Thi': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'his': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + '_is': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + '_th': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'he_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'xt!': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 1/15}, + 'Voi': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'ici': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'ci_': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + '_le': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'le_': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'xte': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'te!': {'Dupont_Letter1.txt': 1/13, 'Smith_Letter2.txt': 0, 'Smith_Letter1.txt': 0}, + 'is,': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 's,_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + ',_a': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + '_al': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 'als': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 'lso': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 'so_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + 'o_,': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + '_,_': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}, + ',_t': {'Dupont_Letter1.txt': 0, 'Smith_Letter2.txt': 1/23, 'Smith_Letter1.txt': 0}} + + self.assertEqual(sorted(feats), sorted(expected_feats)) + self.assertEqual(corpus.to_dict(), expected_corpus) def test_load_texts_txt(self): @@ -393,21 +486,29 @@ def test_counts(self): # WHEN results = superstyl.preproc.features_extract.count_features(text, feats ="words", n = 1) # THEN - expected = {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1} + expected = ({'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1}, 10) self.assertEqual(results, expected) # WHEN results = superstyl.preproc.features_extract.count_features(text, feats="words", n=2) # THEN - expected = {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1} + expected = ({'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1}, 9) self.assertEqual(results, expected) # GIVEN - text = "the yo yo" + text = "These yo yo!" 
# WHEN results = superstyl.preproc.features_extract.count_features(text, feats="chars", n=3) # THEN - expected = {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1} + expected = ({'_yo': 2, 'The': 1, 'hes': 1, 'ese': 1, 'se_': 1, 'e_y': 1, 'yo_': 1, 'o_y': 1, 'yo!': 1}, 10) + self.assertEqual(results, expected) + + # GIVEN + text = "These yo yo!" + # WHEN + results = superstyl.preproc.features_extract.count_features(text, feats="affixes", n=3) + # THEN + expected = ({'_yo': 2, 'The': 1, 'ese': 1, 'se_': 1, 'yo_': 1, 'yo!': 1}, 10) self.assertEqual(results, expected) def test_max_sampling(self): From 096fcc0b1e968ed67fffb693a3af04cb356be699 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 19:00:36 +0100 Subject: [PATCH 3/7] added tests for embedding --- superstyl/load.py | 4 +- superstyl/preproc/embedding.py | 24 +++++------ tests/test_load_corpus.py | 74 ++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 17 deletions(-) diff --git a/superstyl/load.py b/superstyl/load.py index 7e19b21b..eec1afdb 100644 --- a/superstyl/load.py +++ b/superstyl/load.py @@ -2,7 +2,6 @@ import superstyl.preproc.features_extract as fex from superstyl.preproc.text_count import count_process import superstyl.preproc.embedding as embed -import json import tqdm import pandas @@ -53,7 +52,8 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs if embedding: print(".......embedding counts.......") - myTexts = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size) + myTexts, my_feats = embed.get_embedded_counts(myTexts, my_feats, model, topn=neighbouring_size) + feat_list = [f for f in feat_list if f[0] in my_feats] unique_texts = [text["name"] for text in myTexts] diff --git a/superstyl/preproc/embedding.py b/superstyl/preproc/embedding.py index 6d0d8404..6dc0f262 100644 --- a/superstyl/preproc/embedding.py +++ b/superstyl/preproc/embedding.py @@ -1,5 +1,3 @@ -import numpy as np -from scipy import spatial import gensim.models def load_embeddings(path): @@ -25,12 +23,7 @@ def find_similar_words(model, word, topn=10): else: return [s[0] for s in model.most_similar(word, topn=topn)] -# For tests -# myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', 'wordCounts': {'the': 1, 'this': 1}}, -# {'name': 'Letter2', 'aut': 'Smith', 'text': 'Also the text', 'lang': 'en', -# 'wordCounts': {'the': 1, 'also': 1}}] -# feat_list = ['the'] -# feat = "the" + def get_embedded_counts(myTexts, feat_list, model, topn=10): """ Replace absolute frequencies by frequencies relative to a given semantic neighbouring @@ -40,7 +33,14 @@ def get_embedded_counts(myTexts, feat_list, model, topn=10): :param model: the embeddings model :param topn: the n closest (as per cosine similarity) words on which to compute relative frequency :return: the myTexts collection with, for each text, a 'wordCounts' dictionary with said semantic relative frequencies + as well as the new feat_list with only the features that were actually used """ + # First, create the new key + for i in enumerate(myTexts): + myTexts[i[0]]["embedded"] = {} + + # keep only features present in the embedding + feat_list = [f for f in feat_list if f in list(model.index_to_key)] for feat in feat_list: similars = find_similar_words(model, feat, topn=topn) @@ -50,17 +50,11 @@ def get_embedded_counts(myTexts, feat_list, model, topn=10): else: for i in enumerate(myTexts): - if feat in myTexts[i[0]]["wordCounts"].keys(): - - if "embedded" not in 
myTexts[i[0]].keys(): - # then, initialise - myTexts[i[0]]["embedded"] = {} - total = sum([myTexts[i[0]]["wordCounts"][s] for s in [feat]+similars if s in myTexts[i[0]]["wordCounts"].keys()]) myTexts[i[0]]["embedded"][feat] = myTexts[i[0]]["wordCounts"][feat] / total - return myTexts + return myTexts, feat_list diff --git a/tests/test_load_corpus.py b/tests/test_load_corpus.py index faa3c651..52887e9d 100644 --- a/tests/test_load_corpus.py +++ b/tests/test_load_corpus.py @@ -216,6 +216,29 @@ def test_load_corpus(self): self.assertEqual(sorted(feats), sorted(expected_feats)) self.assertEqual(corpus.to_dict(), expected_corpus) + # WHEN + # TODO: fix pos ! + #corpus, feats = superstyl.load.load_corpus(self.paths, feats="pos", n=2, format="txt") + + # Now, test embedding + # WHEN + corpus, feats = superstyl.load.load_corpus(self.paths, feats="words", n=1, format="txt", + embedding=THIS_DIR+"/embed/test_embedding.wv.txt", + neighbouring_size=1) + # THEN + + expected_feats = [('this', 2), ('is', 2), ('the', 2), ('text', 2), ('also', 1)] + expected_corpus = {'author': {'Dupont_Letter1.txt': 'Dupont', 'Smith_Letter1.txt': 'Smith', 'Smith_Letter2.txt': 'Smith'}, + 'lang': {'Dupont_Letter1.txt': 'NA', 'Smith_Letter1.txt': 'NA', 'Smith_Letter2.txt': 'NA'}, + 'this': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.5, 'Smith_Letter2.txt': 0.5}, + 'is': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.5, 'Smith_Letter2.txt': 0.5}, + 'the': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.5, 'Smith_Letter2.txt': 0.5}, + 'text': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.5, 'Smith_Letter2.txt': 0.5}, + 'also': {'Dupont_Letter1.txt': 0.0, 'Smith_Letter1.txt': 0.0, 'Smith_Letter2.txt': 1.0}} + self.assertEqual(feats, expected_feats) + self.assertEqual(corpus.to_dict(), expected_corpus) + + def test_load_texts_txt(self): # SCENARIO: from paths to txt, get myTexts object, i.e., a list of dictionaries @@ -525,6 +548,57 @@ def test_max_sampling(self): self.assertEqual(len([text for text in results if text["aut"] == 'Smith']), 1) +class Embed(unittest.TestCase): + model = superstyl.preproc.embedding.load_embeddings(THIS_DIR+"/embed/test_embedding.wv.txt") + def test_find_similar_words(self): + # Feature: find the n most similar words in an embedding + # GIVEN + word = "this" + # WHEN + results = superstyl.preproc.embedding.find_similar_words(self.model, word, topn=1) + # THEN + expected = ["the"] + self.assertEqual(results, expected) + + # GIVEN + word = "supercalifragilistic" + # WHEN + results = superstyl.preproc.embedding.find_similar_words(self.model, word, topn=1) + # THEN + expected = None + self.assertEqual(results, expected) + + def test_get_embedded_counts(self): + # FEATURE : for a myTexts objects, containing feature counts, a list of features, and an embedding model + # Get the relative frequencies of each words in regard to the topn most similar in the model + + # GIVEN + myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', + 'wordCounts': {'this': 1, 'is': 1, 'the': 1, 'text': 1}}, + {'name': 'Letter2', 'aut': 'Smith', 'text': 'This is also the text', 'lang': 'en', 'wordCounts': + {'this': 1, 'is': 1, 'also': 1, 'the': 1, 'text': 1}}, + {'name': 'Letter1', 'aut': 'Dupont', 'text': 'Voici le texte', 'lang': 'fr', 'wordCounts': + {'Voici': 1, 'le': 1, 'texte': 1}}] + feat_list = ["this", "the", "voici"] + # WHEN + results, new_feat_list = superstyl.preproc.embedding.get_embedded_counts(myTexts, feat_list, self.model, topn=1) + # THEN + expected = 
[{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', + 'wordCounts': {'this': 1, 'is': 1, 'the': 1, 'text': 1}, + 'embedded': {'this': 0.5, 'the': 0.5}}, + {'name': 'Letter2', 'aut': 'Smith', 'text': 'This is also the text', 'lang': 'en', + 'wordCounts': {'this': 1, 'is': 1, 'also': 1, 'the': 1, 'text': 1}, + 'embedded': {'this': 0.5, 'the': 0.5}}, + {'name': 'Letter1', 'aut': 'Dupont', 'text': 'Voici le texte', 'lang': 'fr', + 'wordCounts': {'Voici': 1, 'le': 1, 'texte': 1}, + 'embedded': {} + }] + self.assertEqual(results, expected) + self.assertEqual(new_feat_list, ["this", "the"]) + + + + # TODO: tests for SVM, etc. # Test all options of main commands, see if they are accepted or not From 43d0f446d235891c3d43bc097d6944203a4325e2 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 19:02:18 +0100 Subject: [PATCH 4/7] and the embedding file --- tests/embed/test_embedding.wv.txt | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/embed/test_embedding.wv.txt diff --git a/tests/embed/test_embedding.wv.txt b/tests/embed/test_embedding.wv.txt new file mode 100644 index 00000000..e0a00662 --- /dev/null +++ b/tests/embed/test_embedding.wv.txt @@ -0,0 +1,7 @@ +6 100 +the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062 +, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158 0.22783 -0.16642 -0.68228 0.3587 0.42568 0.19021 0.91963 0.57555 0.46185 0.42363 -0.095399 -0.42749 -0.16567 -0.056842 -0.29595 0.26037 -0.26606 -0.070404 -0.27662 0.15821 0.69825 0.43081 0.27952 -0.45437 -0.33801 -0.58184 0.22364 -0.5778 -0.26862 -0.20425 0.56394 -0.58524 -0.14365 -0.64218 0.0054697 -0.35248 0.16162 1.1796 -0.47674 -2.7553 -0.1321 -0.047729 1.0655 1.1034 -0.2208 0.18669 0.13177 0.15117 0.7131 -0.35215 0.91348 0.61783 0.70992 0.23955 -0.14571 -0.37859 -0.045959 -0.47368 0.2385 0.20536 -0.18996 0.32507 -1.1112 -0.36341 0.98679 -0.084776 -0.54008 0.11726 -1.0194 -0.24424 0.12771 0.013884 0.080374 -0.35414 0.34951 -0.7226 0.37549 0.4441 -0.99059 0.61214 -0.35111 -0.83155 0.45293 0.082577 +this -0.57058 0.44183 0.70102 -0.41713 -0.34058 0.02339 -0.071537 0.48177 -0.013121 0.16834 -0.13389 0.040626 0.15827 -0.44342 -0.019403 -0.009661 -0.046284 0.093228 -0.27331 0.2285 0.33089 -0.36474 0.078741 0.3585 0.44757 -0.2299 0.18077 -0.6265 0.053852 -0.29154 -0.4256 0.62903 0.14393 -0.046004 -0.21007 0.48879 -0.057698 0.37431 -0.030075 -0.34494 -0.29702 0.15095 0.28248 -0.16578 0.076131 -0.093016 0.79365 -0.60489 -0.18874 -1.0173 0.31962 -0.16344 0.54177 1.1725 -0.47875 -3.3842 -0.081301 -0.3528 1.8372 0.44516 -0.52666 0.99786 -0.32178 0.033462 1.1783 -0.072905 0.39737 0.26166 0.33111 
-0.35629 -0.16558 -0.44382 -0.14183 -0.37976 0.28994 -0.029114 -0.35169 -0.27694 -1.344 0.19555 0.16887 0.040237 -0.80212 0.23366 -1.3837 -0.023132 0.085395 -0.74051 -0.073934 -0.58838 -0.085735 -0.10525 -0.51571 0.15038 -0.16694 -0.16372 -0.22702 -0.66102 0.47197 0.37253 +is -0.54264 0.41476 1.0322 -0.40244 0.46691 0.21816 -0.074864 0.47332 0.080996 -0.22079 -0.12808 -0.1144 0.50891 0.11568 0.028211 -0.3628 0.43823 0.047511 0.20282 0.49857 -0.10068 0.13269 0.16972 0.11653 0.31355 0.25713 0.092783 -0.56826 -0.52975 -0.051456 -0.67326 0.92533 0.2693 0.22734 0.66365 0.26221 0.19719 0.2609 0.18774 -0.3454 -0.42635 0.13975 0.56338 -0.56907 0.12398 -0.12894 0.72484 -0.26105 -0.26314 -0.43605 0.078908 -0.84146 0.51595 1.3997 -0.7646 -3.1453 -0.29202 -0.31247 1.5129 0.52435 0.21456 0.42452 -0.088411 -0.17805 1.1876 0.10579 0.76571 0.21914 0.35824 -0.11636 0.093261 -0.62483 -0.21898 0.21796 0.74056 -0.43735 0.14343 0.14719 -1.1605 -0.050508 0.12677 -0.014395 -0.98676 -0.091297 -1.2054 -0.11974 0.047847 -0.54001 0.52457 -0.70963 -0.32528 -0.1346 -0.41314 0.33435 -0.0072412 0.32253 -0.044219 -1.2969 0.76217 0.46349 +also -0.33819 0.064568 -0.032558 -0.29448 0.84125 -0.29092 -0.35264 0.35777 0.004152 -0.0067549 -0.11512 -0.38832 0.49764 0.47187 0.046247 -0.059806 0.59317 -0.080286 -0.45926 0.28211 0.33909 -0.25741 0.30599 0.53594 0.1168 -0.30916 -0.16143 -0.1841 -0.26339 -0.035592 -0.13136 1.1538 -0.61616 0.73314 0.46168 0.4241 0.2918 0.73092 -0.17098 -0.03529 -0.6867 -0.24653 0.34776 -0.46747 0.21257 -0.052958 0.1032 -0.52122 0.61087 -0.71005 -0.16765 -0.34415 0.27119 1.1337 -0.33195 -2.3864 -0.52352 -0.25531 0.80993 1.3563 -0.1452 0.32792 0.11149 0.17806 1.0008 -0.37243 0.3127 0.28634 0.47915 -0.23534 0.13146 -0.5478 0.054173 -0.19163 0.16276 -0.067267 -0.0044537 0.55708 -1.2568 -0.063385 0.62438 -0.28284 -0.6458 -0.2832 -1.8987 -0.5706 0.026083 -0.41721 0.29686 -0.18416 -0.19252 -0.59915 -0.17981 0.17649 -0.56043 0.48284 -0.44081 -0.84036 0.78533 0.36017 +text -0.49705 0.71642 0.40119 -0.05761 0.83614 0.8256 0.08963 -0.53492 0.34335 -0.27079 -0.011152 0.025207 -0.1235 0.11801 0.045312 0.73144 0.13744 -0.13084 -0.028249 -0.30789 -0.81864 -0.54517 0.25151 0.53891 0.38293 -1.0343 -0.1104 0.44977 -0.13019 0.24847 0.1048 0.19567 -0.42672 -0.37912 0.14535 -0.025532 -0.23523 -0.3638 -0.14269 0.0062072 -0.63 -0.23068 0.086461 0.22126 -0.65625 -0.55701 -0.60243 -0.13159 -0.027226 0.0044152 1.4123 1.3042 0.54118 0.33443 -0.51865 -1.8253 -0.30525 -0.32747 1.236 0.08771 0.007793 0.36571 -0.39304 -0.79174 0.57874 -0.0025427 0.10442 0.64166 -0.1881 -0.76203 0.23008 0.30637 1.0386 -0.69846 0.31094 0.63762 -0.09997 0.16999 -0.59984 -0.89565 -0.25059 -0.93011 -0.59606 -0.32965 -1.6828 0.39102 0.65383 -1.5176 0.61748 0.0075596 0.040066 0.60803 -0.027058 0.15273 -0.16887 -0.47664 -0.61775 -0.98735 0.23776 0.39952 From e798f5c2757bc2c591806779eec3b00a1e668663 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Thu, 22 Feb 2024 20:16:01 +0100 Subject: [PATCH 5/7] added documentation --- load_corpus.py | 5 +++-- superstyl/load.py | 27 ++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/load_corpus.py b/load_corpus.py index 3ebfc275..56dd309b 100755 --- a/load_corpus.py +++ b/load_corpus.py @@ -19,7 +19,8 @@ parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int) parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int) parser.add_argument('--absolute_freqs', action='store_true', help="switch to 
get absolute instead of relative freqs", default=False)
-    parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt",
+    parser.add_argument('-x', action='store', help="format (txt, xml or tei) /!\ only txt is fully implemented",
+                        default="txt",
                         choices=["txt", "xml", "tei"]
                         )
     parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
@@ -31,7 +32,7 @@
     parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
     parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
                         default=None, type=int)
-    parser.add_argument('--keep_punct', action='store_true', help="whether or not to keep punctuation and caps (default is False)",
+    parser.add_argument('--keep_punct', action='store_true', help="whether to keep punctuation and caps (default is False)",
                         default=False)
     parser.add_argument('--keep_sym', action='store_true',
                         help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)",
diff --git a/superstyl/load.py b/superstyl/load.py
index eec1afdb..83101ffe 100644
--- a/superstyl/load.py
+++ b/superstyl/load.py
@@ -10,7 +10,32 @@ def load_corpus(data_paths, feat_list=None, feats="words", n=1, k=5000, relFreqs
                 identify_lang=False, embedding=False, neighbouring_size=10):
     """
     Main function to load a corpus from a collection of file, and an optional list of features to extract.
-    :param #TODO, document all params
+    :param data_paths: paths to the source files
+    :param feat_list: an optional list of features (as created by load_corpus), default None
+    :param feats: the type of features, one of 'words', 'chars', 'affixes', and 'POS'. Affixes are inspired by
+    Sapkota et al. 2015, and include space_prefix, space_suffix, prefix, suffix, and, if keep_punct, punctuation n-grams.
+    POS are currently only implemented for Modern English
+    :param n: n-gram lengths (default 1)
+    :param k: how many of the most frequent features to keep. The function takes the frequency of the feature at rank k
+    (if k is smaller than the total number of features), and keeps only the features whose total frequency is greater
+    than or equal to it.
+    :param relFreqs: return relative frequencies (default: True)
+    :param format: one of txt, xml or tei. /!\ only txt is fully implemented.
+    :param sampling: whether to sample the texts, by cutting them into slices of a given length, up to the last possible
+    slice of this length, which means that the end of a text will often be discarded (default False)
+    :param units: units of length for sampling, one of 'words', 'verses' (default: words). 'verses' is only implemented
+    for the 'tei' format
+    :param size: the size of the samples (in units)
+    :param step: step for sampling with overlap (default is step = size, which means no overlap).
+    Reduce for overlapping slices
+    :param max_samples: Maximum number of (randomly selected) samples per author/class (default is all)
+    :param keep_punct: whether to keep punctuation and caps (default is False)
+    :param keep_sym: same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False).
+    /!\ does not actually keep symbols
+    :param identify_lang: if true, the language of each text will be guessed, using langdetect (default is False)
+    :param embedding: optional path to a word2vec embedding in txt format, to compute frequencies among a set of
+    semantic neighbours (i.e., pseudo-paronyms)
+    :param neighbouring_size: size of the semantic neighbourhood in the embedding (as per gensim most_similar,
+    with topn=neighbouring_size)
     :return a pandas dataFrame of text metadata and feature frequencies; a global list of features with their
     frequencies
     """

From 6e5702348be9597e36a32cf40b2bb4416ff91e51 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps
Date: Thu, 22 Feb 2024 20:18:18 +0100
Subject: [PATCH 6/7] small doc change

---
 load_corpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/load_corpus.py b/load_corpus.py
index 56dd309b..cfe51749 100755
--- a/load_corpus.py
+++ b/load_corpus.py
@@ -9,7 +9,7 @@
 
     import argparse
     parser = argparse.ArgumentParser()
-    parser.add_argument('-s', nargs='+', help="paths to files")
+    parser.add_argument('-s', nargs='+', help="paths to files", required=True)
     parser.add_argument('-o', action='store', help="optional base name of output files", type=str, default=False)
     parser.add_argument('-f', action="store", help="optional list of features in json", default=False)
     parser.add_argument('-t', action='store', help="types of features (words, chars, affixes - "

From 8fb83710314a7073c0973cf10deeb8666be99acb Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps
Date: Thu, 22 Feb 2024 20:24:53 +0100
Subject: [PATCH 7/7] fixed README and now DONE

---
 README.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/README.md b/README.md
index 84e3edf0..82486fa3 100755
--- a/README.md
+++ b/README.md
@@ -140,15 +140,6 @@ python load_corpus.py -s data/psyche/train/* -t chars -n 3 -x tei --sampling --s
 You have a lot of options for feats extraction, inclusion or not of punctuation and symbols, sampling, source file
 formats, …, that can be accessed through the help.
 
-### Optional: Filter features
-
-You can filter certain features (for instance retain only 'pseudo-affixes' from character n-grams) using the command
-
-```bash
-python features_filter.py -f feature_list.json --affixes_grams --punct_grams
-```
-
-
 ### Optional: Merge different features
 
 You can merge several sets of features, extracted in csv with the previous commands, by doing:
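
To make the new `affixes` feature type concrete, here is a minimal, self-contained sketch of what the branch added in PATCH 1 (and fixed in PATCH 2) computes. This is an illustration, not superstyl's API: `affix_counts` is a made-up name, and the word-affix slices are written with `n` where the diffs above hardcode 3-character slices (the two coincide for the n=3 case the tests exercise).

```python
from collections import Counter

import nltk
import nltk.tokenize
import regex as re

def affix_counts(text, n=3):
    """Affix n-grams, after Sapkota et al., NAACL 2015 (illustrative sketch)."""
    words = nltk.tokenize.wordpunct_tokenize(text)
    # Word-affix types: the first and last n characters of each word longer than n.
    affs = [w[:n] for w in words if len(w) > n] + [w[-n:] for w in words if len(w) > n]
    # Space and punctuation types: character n-grams that start or end with a
    # space, or contain punctuation; spaces are rendered as underscores.
    ngrams = [''.join(g) for g in nltk.ngrams(text, n)]
    space_and_punct = [re.sub(r'\p{Z}', '_', g) for g in ngrams
                       if re.search(r'(^\p{Z})|(\p{Z}$)|(\p{P})', g)]
    # The total counts every character n-gram, so that relative frequencies are
    # computed over the full n-gram population, as in PATCH 2.
    return Counter(affs + space_and_punct), len(ngrams)

print(affix_counts("These yo yo!", n=3))
# (Counter({'_yo': 2, 'The': 1, 'ese': 1, 'se_': 1, 'yo_': 1, 'yo!': 1}), 10)
```

The printed result matches the expectation added to `test_counts` in PATCH 2.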
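Likewise, the embedding logic tested in PATCH 3 boils down to replacing a word's relative frequency with its frequency within a small semantic neighbourhood. A rough sketch under the same caveats — `neighbourhood_relative_freq` is a hypothetical helper, not part of superstyl, and it assumes a gensim `KeyedVectors` model such as the test embedding added in PATCH 4:

```python
import gensim.models

def neighbourhood_relative_freq(word_counts, word, wv, topn=10):
    """Frequency of `word` relative to itself plus its topn nearest neighbours."""
    if word not in wv.key_to_index or word not in word_counts:
        return None  # word missing from the embedding or from the text
    neighbours = [w for w, _ in wv.most_similar(word, topn=topn)]
    # Sum the counts of the word and of those neighbours that occur in the text.
    total = sum(word_counts.get(w, 0) for w in [word] + neighbours)
    return word_counts[word] / total

# For instance, with the test embedding (where "the" is the nearest neighbour
# of "this") and the word counts of Smith_Letter1:
# wv = gensim.models.KeyedVectors.load_word2vec_format("tests/embed/test_embedding.wv.txt")
# neighbourhood_relative_freq({'this': 1, 'is': 1, 'the': 1, 'text': 1}, "this", wv, topn=1)
# -> 0.5, as expected in test_get_embedded_counts
```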