Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various fixes #68

Merged
merged 5 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion load_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@

if not args.f:
with open(feat_file, "w") as out:
out.write(json.dumps(my_feats, ensure_ascii=False))
out.write(json.dumps(my_feats, ensure_ascii=False, indent=0))
print("Features list saved to " + feat_file)

corpus.to_csv(corpus_file)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ nltk>=3.6.6
numpy>=1.26.4
pybind11>=2.8.1
scikit-learn>=1.3.0
scipy>=1.10.0
scipy>=1.10.0,<1.13.0
six>=1.16.0
tqdm>=4.64.1
unidecode>=1.3.2
Expand Down
18 changes: 15 additions & 3 deletions superstyl/preproc/pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,18 +77,30 @@ def detect_lang(string):


def normalise(text, keep_punct=False, keep_sym=False):
"""
Function to normalise an input string. By default, it removes all but word chars, removes accents,
normalises spaces, and then normalises unicode.
:param keep_punct: if true, in addition, also keeps Punctuation and case distinction
:param keep_sym: if true, same as keep_punct, but also keeps Numbers, Symbols, Marks, such as combining diacritics,
as well as Private use characters, and no Unidecode is applied
"""
# Remove all but word chars, remove accents, and normalise space
# and then normalise unicode

if keep_sym:
out = re.sub(r"\s+", " ", re.sub(r"[^\p{L}\p{P}\p{N}]+", " ", text.strip()))
out = re.sub(r"[^\p{L}\p{P}\p{N}\p{S}\p{M}\p{Co}]+", " ", text)

else:
if keep_punct:
out = re.sub(r"\s+", " ", unidecode.unidecode(re.sub(r"[^\p{L}\p{P}]+", " ", text.strip())))
out = re.sub(r"[^\p{L}\p{P}]+", " ", text)

else:
out = re.sub(r"\s+", " ", unidecode.unidecode(re.sub(r"[\W0-9]+", " ", text.lower()).strip()))
#out = re.sub(r"[\W0-9]+", " ", text.lower())
out = re.sub(r"[^\p{L}]+", " ", text.lower())

out = unidecode.unidecode(out)

out = re.sub(r"\s+", " ", out).strip()

return out

Expand Down
59 changes: 37 additions & 22 deletions tests/test_load_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,14 +283,15 @@ def test_load_texts_txt(self):

self.assertEqual(results, expected)

#TODO: test keep_sym, according to revised definition
# WHEN
# results = superstyl.preproc.pipe.load_texts(self.paths, identify_lang=False, format="txt",
# keep_sym=True, max_samples=None)
results = superstyl.preproc.pipe.load_texts(self.paths, identify_lang=False, format="txt",
keep_sym=True, max_samples=None)
# THEN
# expected = [{'name': 'Dupont_Letter1.txt', 'aut': 'Dupont', 'text': 'Voici le texte!', 'lang': 'NA'},
# {'name': 'Smith_Letter1.txt', 'aut': 'Smith', 'text': 'This is the text!', 'lang': 'NA'},
# {'name': 'Smith_Letter2.txt', 'aut': 'Smith', 'text': 'This is, also , the text!', 'lang': 'NA'}]
expected = [{'name': 'Dupont_Letter1.txt', 'aut': 'Dupont', 'text': 'Voici le texte!', 'lang': 'NA'},
{'name': 'Smith_Letter1.txt', 'aut': 'Smith', 'text': 'This is the text!', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt', 'aut': 'Smith', 'text': 'This is, © also © , the text!', 'lang': 'NA'}]

self.assertEqual(results, expected)

# WHEN
results = superstyl.preproc.pipe.load_texts(self.paths, identify_lang=True, format="txt", keep_punct=True,
Expand All @@ -314,9 +315,9 @@ def test_docs_to_samples(self):
self.assertEqual(results, expected)

# WHEN
results = superstyl.preproc.pipe.docs_to_samples(self.paths, identify_lang=False, size=2, step=1,
results = superstyl.preproc.pipe.docs_to_samples(sorted(self.paths), identify_lang=False, size=2, step=1,
units="words", format="txt", keep_punct=True,
keep_sym=False,
keep_sym=True,
max_samples=None)

# THEN
Expand All @@ -329,14 +330,15 @@ def test_docs_to_samples(self):
{'name': 'Smith_Letter1.txt_3-5', 'aut': 'Smith', 'text': 'text !', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_0-2', 'aut': 'Smith', 'text': 'This is', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_1-3', 'aut': 'Smith', 'text': 'is ,', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_2-4', 'aut': 'Smith', 'text': ', also', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_3-5', 'aut': 'Smith', 'text': 'also ,', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_4-6', 'aut': 'Smith', 'text': ', the', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_5-7', 'aut': 'Smith', 'text': 'the text', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_6-8', 'aut': 'Smith', 'text': 'text !', 'lang': 'NA'}]
self.assertEqual(results, expected)
{'name': 'Smith_Letter2.txt_2-4', 'aut': 'Smith', 'text': ', ©', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_3-5', 'aut': 'Smith', 'text': '© also', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_4-6', 'aut': 'Smith', 'text': 'also ©', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_5-7', 'aut': 'Smith', 'text': '© ,', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_6-8', 'aut': 'Smith', 'text': ', the', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_7-9', 'aut': 'Smith', 'text': 'the text', 'lang': 'NA'},
{'name': 'Smith_Letter2.txt_8-10', 'aut': 'Smith', 'text': 'text !', 'lang': 'NA'}]

# TODO: test keep_sym
self.assertEqual(results, expected)

# WHEN
results = superstyl.preproc.pipe.docs_to_samples(self.paths, identify_lang=True, size=2, step=None,
Expand Down Expand Up @@ -492,13 +494,26 @@ class DataLoading(unittest.TestCase):
# Now down to lower level features
# First, testing the pipe features
def test_normalise(self):
text = " Hello, Mr. 𓀁, how are §§ you; doing?"
expected_default = "hello mr how are you doing"
self.assertEqual(superstyl.preproc.pipe.normalise(text), expected_default)
expected_keeppunct = "Hello, Mr. , how are SSSS you; doing?"
self.assertEqual(superstyl.preproc.pipe.normalise(text, keep_punct=True), expected_keeppunct)
expected_keepsym = "Hello, Mr. 𓀁, how are §§ you; doing?" #TODO: modify test according to new def
self.assertEqual(superstyl.preproc.pipe.normalise(text, keep_sym=True), expected_keepsym)
# FEATURE
# Normalise an input text, according to different options
# SCENARIO
# GIVEN
text = " Hello, Mr. 𓀁, how are §§ you; doing? ſõ ❡"
# WHEN
results = superstyl.preproc.pipe.normalise(text)
# THEN
expected_default = "hello mr how are you doing s o"
self.assertEqual(results, expected_default)
# WHEN
results = superstyl.preproc.pipe.normalise(text, keep_punct=True)
# THEN
expected_keeppunct = "Hello, Mr. , how are SSSS you; doing? s o"
self.assertEqual(results, expected_keeppunct)
# WHEN
results = superstyl.preproc.pipe.normalise(text, keep_sym=True)
# THEN
expected_keepsym = "Hello, Mr. 𓀁, how are §§ you; doing? ſõ ❡"
self.assertEqual(results, expected_keepsym)

def test_detect_lang(self):
french = "Bonjour, Monsieur, comment allez-vous?"
Expand Down
7 changes: 6 additions & 1 deletion tests/test_train_svm.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ def test_train_svm(self):
self.assertEqual(results["classification_report"], expected_results["classification_report"])
self.assertEqual(results["misattributions"].to_dict(), expected_results["misattributions"])
self.assertEqual(list(results.keys()), expected_keys)
# This is only the first minimal test for this function

# WHEN
#results = superstyl.train_svm(train, test, final_pred=False, balance="SMOTETomek")


# These are only the first minimal tests for this function


12 changes: 8 additions & 4 deletions train_svm.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,16 @@
else:
args.o = ''

svm["confusion_matrix"].to_csv(args.o+"confusion_matrix.csv")
svm["misattributions"].to_csv(args.o+"misattributions.csv")

if args.cross_validate is not None or (args.test_path is not None and not args.final):
svm["confusion_matrix"].to_csv(args.o+"confusion_matrix.csv")
svm["misattributions"].to_csv(args.o+"misattributions.csv")

joblib.dump(svm["pipeline"], args.o+'mySVM.joblib')

print(".......... Writing final predictions to " + args.o + "FINAL_PREDICTIONS.csv ........")
svm["final_predictions"].to_csv(args.o+"FINAL_PREDICTIONS.csv")
if args.final:
print(".......... Writing final predictions to " + args.o + "FINAL_PREDICTIONS.csv ........")
svm["final_predictions"].to_csv(args.o+"FINAL_PREDICTIONS.csv")

if args.get_coefs:
print(".......... Writing coefficients to disk ........")
Expand Down
Loading