Skip to content

Commit

Permalink
Merge pull request #43 from SupervisedStylometry/moreTests
Browse files Browse the repository at this point in the history
Starting to create more tests
  • Loading branch information
Jean-Baptiste-Camps committed Feb 15, 2024
2 parents c8d4d60 + b005a17 commit dbe264b
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 27 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]

steps:
- uses: actions/checkout@v3
Expand Down
1 change: 0 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import superstyl.preproc.tuyau as tuy
import superstyl.preproc.features_extract as fex
from superstyl.preproc.text_count import count_process
import fasttext
import pandas
import json
# from multiprocessing import Pool
Expand Down
34 changes: 17 additions & 17 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
langdetect==1.0.9
joblib==1.2.0
lxml==4.9.1
nltk==3.6.6
numpy==1.26.4
pybind11==2.8.1
scikit-learn==1.2.1
scipy==1.10.0
six==1.16.0
tqdm==4.64.1
unidecode==1.3.2
pandas==2.2.0
pyarrow==15.0.0
argparse==1.4.0
regex==2022.10.31
matplotlib==3.6.2
imbalanced-learn==0.8.1
langdetect>=1.0.9
joblib>=1.2.0
lxml>=4.9.1
nltk>=3.6.6
numpy>=1.26.4
pybind11>=2.8.1
scikit-learn>=1.2.1
scipy>=1.10.0
six>=1.16.0
tqdm>=4.64.1
unidecode>=1.3.2
pandas>=2.2.0
pyarrow>=15.0.0
argparse>=1.4.0
regex>=2022.10.31
matplotlib>=3.6.2
imbalanced-learn>=0.8.1
8 changes: 4 additions & 4 deletions superstyl/preproc/features_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
"""
Get word counts from a text
:param text: the source text
:param feat_list: a list of features to be selected
:param feats: the type of feats (words, chars, etc.)
:param n: the length of n-grams
:param relFreqs: whether or not to compute relative freqs
:param relFreqs: whether to compute relative freqs
:return: feature frequencies in text
"""

Expand Down Expand Up @@ -51,8 +52,8 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):

def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
"""
:param myTexts: a 'myTexts' object, containing documents to be processed
:param feat_list: a list of features to be selected
:param feats: type of feats (words, chars)
:param n: n-grams length
:return: list of features, with total frequency
Expand All @@ -65,7 +66,6 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
my_feats.update(counts)

# sort them

my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]

return my_feats
Expand All @@ -77,7 +77,7 @@ def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False):
:param myTexts: the document collection
:param feats: the type of feats (words, chars, etc.)
:param n: the length of n-grams
:param relFreqs: whether or not to compute relative freqs
:param relFreqs: whether to compute relative freqs
:return: the collection with, for each text, a 'wordCounts' dictionary
"""

Expand Down
95 changes: 91 additions & 4 deletions tests/test_main.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,102 @@
import unittest
from superstyl.preproc.tuyau import normalise
import superstyl.preproc.tuyau
import superstyl.preproc.features_extract

class DataLoading(unittest.TestCase):
# First, testing the tuyau features
def test_normalise(self):
# Checks superstyl.preproc.tuyau.normalise on one sample mixing punctuation,
# a hieroglyph and section signs, under three settings.
# NOTE(review): every assertion below appears twice, once through the bare
# `normalise` name and once through the fully-qualified module path. This
# looks like removed/added diff lines rendered together -- confirm which
# set is current before editing.
text = " Hello, Mr. 𓀁, how are §§ you; doing?"
# Default call: the expected result is lower-cased, with punctuation and
# symbols gone.
expected_default = "hello mr how are you doing"
self.assertEqual(normalise(text), expected_default)
self.assertEqual(superstyl.preproc.tuyau.normalise(text), expected_default)
# keep_punct=True: case and punctuation are expected to survive, the
# hieroglyph is expected to disappear and "§§" to come out as "SSSS".
expected_keeppunct = "Hello, Mr. , how are SSSS you; doing?"
self.assertEqual(normalise(text, keep_punct=True), expected_keeppunct)
self.assertEqual(superstyl.preproc.tuyau.normalise(text, keep_punct=True), expected_keeppunct)
# keep_sym=True: the hieroglyph and "§§" are expected to be kept as they are.
expected_keepsym = "Hello, Mr. 𓀁, how are §§ you; doing?"
self.assertEqual(normalise(text, keep_sym=True), expected_keepsym)
self.assertEqual(superstyl.preproc.tuyau.normalise(text, keep_sym=True), expected_keepsym)

def test_detect_lang(self):
    """Check the language code returned by detect_lang for three samples.

    The English sample is deliberately long. According to the original
    notes, the test failed on the shorter variants
    "Hello, How do you do good sir?" and
    "Hello, How do you do good sir? Are you well today?".

    NOTE(review): presumably detect_lang is backed by langdetect, whose
    results are not deterministic unless a seed is fixed -- verify against
    the implementation.
    """
    # TODO: find something that manages old languages, like fasttext did…
    detect = superstyl.preproc.tuyau.detect_lang
    samples = (
        ("Bonjour, Monsieur, comment allez-vous?", "fr"),
        ("Hello, How do you do good sir? Are you well today? Is this so bloody hard? Really, this is still failing?", "en"),
        ("Buongiorno signore, come sta?", "it"),
    )
    for sentence, expected_code in samples:
        self.assertEqual(detect(sentence), expected_code)

# Now, lower level features,
# from features_extract
def test_counts(self):
    """Check count_words on word and character n-grams.

    Covers raw counts and relative frequencies, with and without a
    feature list restricting the returned keys.
    """
    count_words = superstyl.preproc.features_extract.count_words

    # 10 word tokens: the x5, cat x3, dog x1, squirrel x1.
    text = "the cat the dog the squirrel the cat the cat"
    self.assertEqual(
        count_words(text, feat_list=None, feats="words", n=1, relFreqs=False),
        {'the': 5, 'cat': 3, 'dog': 1, 'squirrel': 1},
    )
    self.assertEqual(
        count_words(text, feat_list=None, feats="words", n=1, relFreqs=True),
        {'the': 0.5, 'cat': 0.3, 'dog': 0.1, 'squirrel': 0.1},
    )
    # With a feature list, only the listed features are expected back; the
    # relative frequencies are still expected over all 10 tokens.
    self.assertEqual(
        count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=False),
        {'the': 5, 'cat': 3},
    )
    self.assertEqual(
        count_words(text, feat_list=['the', 'cat'], feats="words", n=1, relFreqs=True),
        {'the': 0.5, 'cat': 0.3},
    )
    # Word bigrams: 9 in total, joined with an underscore in the expected keys.
    self.assertEqual(
        count_words(text, feat_list=None, feats="words", n=2, relFreqs=False),
        {'the_cat': 3, 'cat_the': 2, 'the_dog': 1, 'dog_the': 1, 'the_squirrel': 1, 'squirrel_the': 1},
    )
    self.assertEqual(
        count_words(text, feat_list=None, feats="words", n=2, relFreqs=True),
        {'the_cat': 3/9, 'cat_the': 2/9, 'the_dog': 1/9, 'dog_the': 1/9, 'the_squirrel': 1/9, 'squirrel_the': 1/9},
    )

    # Character trigrams: 7 in total; spaces show up as underscores in the
    # expected keys.
    text = "the yo yo"
    self.assertEqual(
        count_words(text, feat_list=None, feats="chars", n=3, relFreqs=False),
        {'the': 1, 'he_': 1, 'e_y': 1, '_yo': 2, 'yo_': 1, 'o_y': 1},
    )
    self.assertEqual(
        count_words(text, feat_list=['the'], feats="chars", n=3, relFreqs=True),
        {'the': 1/7},
    )

# Testing the processing of "myTexts" objects
def test_get_feature_list(self):
    """Check get_feature_list on a three-document collection.

    The expected value is a list of (feature, total count) pairs with the
    highest counts first; case is preserved ('This', 'Voici').
    """
    fields = ("name", "aut", "text", "lang")
    rows = (
        ("Letter1", "Smith", "This is the text", "en"),
        ("Letter2", "Smith", "This is also the text", "en"),
        ("Letter1", "Dupont", "Voici le texte", "fr"),
    )
    corpus = [dict(zip(fields, row)) for row in rows]

    expected = [
        ('This', 2), ('is', 2), ('the', 2), ('text', 2),
        ('also', 1), ('Voici', 1), ('le', 1), ('texte', 1),
    ]
    observed = superstyl.preproc.features_extract.get_feature_list(
        corpus, feats="words", n=1, relFreqs=False
    )
    self.assertEqual(observed, expected)

def test_get_counts(self):
    """Check get_counts on a three-document collection.

    Each returned document is expected to carry its original fields plus
    a 'wordCounts' dictionary holding the relative frequencies of those
    requested features that occur in its text.
    """
    fields = ("name", "aut", "text", "lang")
    rows = (
        ("Letter1", "Smith", "This is the text", "en"),
        ("Letter2", "Smith", "This is also the text", "en"),
        ("Letter1", "Dupont", "Voici le texte", "fr"),
    )
    corpus = [dict(zip(fields, row)) for row in rows]

    # Expected documents are built as independent copies before the call,
    # so they do not depend on whether get_counts modifies its input.
    per_doc_counts = (
        {'the': 0.25, 'is': 0.25},
        {'the': 0.2, 'is': 0.2, 'also': 0.2},
        {'le': 1/3},
    )
    expected = [
        {**doc, 'wordCounts': counts}
        for doc, counts in zip(corpus, per_doc_counts)
    ]

    observed = superstyl.preproc.features_extract.get_counts(
        corpus, ['the', 'is', 'also', 'le'], feats="words", n=1, relFreqs=True
    )
    self.assertEqual(observed, expected)

#TODO: a lot more tests



if __name__ == '__main__':
Expand Down

0 comments on commit dbe264b

Please sign in to comment.