From 9c863fa3d0e47bb6cba0d2f0def0e93b80f98e44 Mon Sep 17 00:00:00 2001 From: Louanes Hamla Date: Tue, 31 Aug 2021 16:11:13 +0200 Subject: [PATCH 1/4] Added my french_noun_synonym transformation --- TestRunner.py | 2 +- .../french_synonym_transformation/README.md | 20 +++++ .../french_synonym_transformation/__init__.py | 2 + .../requirements.txt | Bin 0 -> 278 bytes .../french_synonym_transformation/test.json | 62 +++++++++++++ .../transformation.py | 85 ++++++++++++++++++ 6 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 transformations/french_synonym_transformation/README.md create mode 100644 transformations/french_synonym_transformation/__init__.py create mode 100644 transformations/french_synonym_transformation/requirements.txt create mode 100644 transformations/french_synonym_transformation/test.json create mode 100644 transformations/french_synonym_transformation/transformation.py diff --git a/TestRunner.py b/TestRunner.py index e28b47958..4c70f826f 100644 --- a/TestRunner.py +++ b/TestRunner.py @@ -18,7 +18,7 @@ def load(module, cls): def load_test_cases(test_json): try: - with open(test_json) as f: + with open(test_json,encoding="utf-8") as f: d = json.load(f) examples = d["test_cases"] return examples diff --git a/transformations/french_synonym_transformation/README.md b/transformations/french_synonym_transformation/README.md new file mode 100644 index 000000000..5dd50940e --- /dev/null +++ b/transformations/french_synonym_transformation/README.md @@ -0,0 +1,20 @@ +# Noun Synonym Substitution 🩎 + ⌚ → 🐍 + + +This transformation replaces nouns (words whose POS tag is NOUN) with synonyms in simple French sentences. It requires spacy_lefff (an extension of spaCy for French POS tagging and lemmatization) and the nltk package with the Open Multilingual WordNet dictionary. + +Authors: Lisa Barthe and Louanes Hamla from Fablab by Inetum in Paris + +## What type of transformation is it? 
+This transformation creates paraphrases of a French sentence by substituting a different word. The general meaning of the sentence is preserved, and it can be rendered as different paraphrases, each varying a single noun. + +## Supported Task + +This perturbation can be used for any French task. + +## What does it intend to benefit? + +This perturbation would benefit all tasks that take a sentence/paragraph/document as input, such as text classification or text generation, and that require synthetic data augmentation / diversification. + +## What are the limitations of this transformation? +This tool does not take the general context into account; sometimes the output will not match the general sense of the sentence. \ No newline at end of file diff --git a/transformations/french_synonym_transformation/__init__.py b/transformations/french_synonym_transformation/__init__.py new file mode 100644 index 000000000..89ecd1199 --- /dev/null +++ b/transformations/french_synonym_transformation/__init__.py @@ -0,0 +1,2 @@ +from .transformation import * + diff --git a/transformations/french_synonym_transformation/requirements.txt b/transformations/french_synonym_transformation/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc277cc070b5b1d7d3147193a7adb9419f31e3d7 GIT binary patch literal 278 zcmb71U1sW()N-6>hxjgWkMfWIj{2PzQ-_J!44LU31Bhj6| zsu0Z@wb2oq^yDp64|2S#Zse0*I%(TyXR1+eFbSqYKf~mR76lNk^JN$#h(gD15jcs?D$3eAhN;&-Punf3O3Z8nbq*skEyDzQI(vD(MHS4>Rik literal 0 HcmV?d00001 diff --git a/transformations/french_synonym_transformation/test.json b/transformations/french_synonym_transformation/test.json new file mode 100644 index 000000000..e97494ea9 --- /dev/null +++ b/transformations/french_synonym_transformation/test.json @@ -0,0 +1,62 @@ +{ + "type": "french_synonym_transformation", + "test_cases": [ + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "Il a vĂ©cu par la force et par la force il nous a quittĂ©s. 
Je n'ai pas Ă©tĂ© surpris de la cohĂ©rence avec laquelle il a conclu son existence." + }, + "outputs": [{ + "sentence": "Il a vĂ©cu par la violence et par la violence il nous a quittĂ©s. Je n'ai pas Ă©tĂ© surpris de la cohĂ©rence avec laquelle il a conclu son existence." + }] + + }, + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "Dans cette vie, vous devez planter un arbre, Ă©crire un livre et avoir un enfant." + }, + "outputs": [{ + "sentence": "Dans cette vie, vous devez planter un arbre, Ă©crire un ouvrage et avoir un enfant." + }] + + }, + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "Vous ne pouvez pas voter pour cet homme, avec tout ce qu’il a fait!" + }, + "outputs": [{ + "sentence": "Vous ne pouvez pas voter pour cet individu, avec tout ce qu’il a fait!" + }] + + }, + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "La hausse des taux attirera les investissements." + }, + "outputs": [{ + "sentence": "La hausse des ratio attirera les investissements." + }] + + }, + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "Il mĂšne son projet rapidement et avec rigueur." + }, + "outputs": [{ + "sentence": "Il mĂšne son projet rapidement et avec sĂ©vĂ©ritĂ©." 
+ }] + + } + + + ] +} diff --git a/transformations/french_synonym_transformation/transformation.py b/transformations/french_synonym_transformation/transformation.py new file mode 100644 index 000000000..90738f50d --- /dev/null +++ b/transformations/french_synonym_transformation/transformation.py @@ -0,0 +1,85 @@ +from textblob import TextBlob, Blobber, Word +import re +from textblob_fr import PatternTagger, PatternAnalyzer +import nltk +nltk.download('wordnet') +from textblob.wordnet import NOUN, VERB, ADV, ADJ +import spacy +from spacy_lefff import LefffLemmatizer, POSTagger +from spacy.language import Language +from nltk.corpus import wordnet +import nltk +nltk.download('omw') + +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType + +@Language.factory('french_lemmatizer') +def create_french_lemmatizer(nlp, name): + return LefffLemmatizer() + +@Language.factory('POSTagger') +def create_POSTagger(nlp, name): + return POSTagger() + + +nlp = spacy.load('fr_core_news_md') + +nlp.add_pipe('POSTagger', name ='pos') +nlp.add_pipe('french_lemmatizer', name='lefff', after='pos') + + +def synonym_transformation(text): + doc = nlp(text) + nouns = [d.text for d in doc if d.pos_ == "NOUN"] + synonyms_noun_list = [] + for i in nouns : + dict_noun_synonyms = {} + dict_noun_synonyms['noun'] = i + dict_noun_synonyms['synonyms'] = list(set([l.name() for syn in wordnet.synsets(i, lang = 'fra') for l in syn.lemmas('fra')])) + if len(dict_noun_synonyms['synonyms']) > 0: + synonyms_noun_list.append(dict_noun_synonyms) + + valid_noun_list = [] + for j in synonyms_noun_list: + for k in j['synonyms']: + valid_noun_dict = {} + valid_noun_dict['noun'] = j['noun'] + valid_noun_dict['syn'] = k + if nlp(j['noun']).similarity(nlp(k)) > .55 and not nlp(j['noun']).similarity(nlp(k)) >= .999: + valid_noun_list.append(valid_noun_dict) + + text_noun_generated = [] + pertu=[] + for l in valid_noun_list: + text_noun_generated.append(text.replace(l['noun'], 
l['syn'])) + text_noun_generated.sort(reverse=True) + for sent in text_noun_generated: + if nlp(text).similarity(nlp(i)) > .40 and not nlp(text).similarity(nlp(i)) >= .999: + pertu.append(sent) + break + + return pertu + + + + +class FrenchNounSynonymTransformation(SentenceOperation): + tasks = [ + TaskType.TEXT_CLASSIFICATION, + TaskType.TEXT_TO_TEXT_GENERATION, + TaskType.TEXT_TAGGING, + ] + languages = ["fr"] + + def __init__(self, seed=0, max_outputs=1): + super().__init__(seed, max_outputs=max_outputs) + + def generate(self, sentence : str): + perturbed_texts = synonym_transformation( + sentence + ) + print("perturbed text inside of class",perturbed_texts) + return perturbed_texts + + From 82c23dd699b4d6649a1cb7c19bef4168e330f834 Mon Sep 17 00:00:00 2001 From: Louanes Hamla Date: Tue, 31 Aug 2021 23:07:16 +0200 Subject: [PATCH 2/4] add librariries to requirements --- TestRunner.py | 2 +- .../requirements.txt | Bin 278 -> 422 bytes .../french_synonym_transformation/test.json | 25 +++++------------- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/TestRunner.py b/TestRunner.py index 4c70f826f..e28b47958 100644 --- a/TestRunner.py +++ b/TestRunner.py @@ -18,7 +18,7 @@ def load(module, cls): def load_test_cases(test_json): try: - with open(test_json,encoding="utf-8") as f: + with open(test_json) as f: d = json.load(f) examples = d["test_cases"] return examples diff --git a/transformations/french_synonym_transformation/requirements.txt b/transformations/french_synonym_transformation/requirements.txt index fc277cc070b5b1d7d3147193a7adb9419f31e3d7..4821ff3dc04dae9b6c9148adace31b8479adb5ce 100644 GIT binary patch delta 156 zcmbQnw2ay8|377hG=?Gu1%_mXd?1|)WaRo;kTFIabluiX21%kFfXuzPyU;@MjU=1ZexeA67h9sb>d>|PQwiKq$ O2(B&V!mdK!oT delta 11 ScmZ3+JdKIz|G$l9B8&hVE(ARQ diff --git a/transformations/french_synonym_transformation/test.json b/transformations/french_synonym_transformation/test.json index e97494ea9..612a9aea9 100644 --- 
a/transformations/french_synonym_transformation/test.json +++ b/transformations/french_synonym_transformation/test.json @@ -5,10 +5,10 @@ { "class": "FrenchNounSynonymTransformation", "inputs": { - "sentence": "Il a vĂ©cu par la force et par la force il nous a quittĂ©s. Je n'ai pas Ă©tĂ© surpris de la cohĂ©rence avec laquelle il a conclu son existence." + "sentence": "Il vit par la force et par la force il est parti." }, "outputs": [{ - "sentence": "Il a vĂ©cu par la violence et par la violence il nous a quittĂ©s. Je n'ai pas Ă©tĂ© surpris de la cohĂ©rence avec laquelle il a conclu son existence." + "sentence": "Il vit par la violence et par la violence il est parti." }] }, @@ -16,25 +16,14 @@ { "class": "FrenchNounSynonymTransformation", "inputs": { - "sentence": "Dans cette vie, vous devez planter un arbre, Ă©crire un livre et avoir un enfant." + "sentence": "Dans cette vie, vous devez planter un arbre, lire un livre et avoir un enfant." }, "outputs": [{ - "sentence": "Dans cette vie, vous devez planter un arbre, Ă©crire un ouvrage et avoir un enfant." + "sentence": "Dans cette vie, vous devez planter un arbre, lire un ouvrage et avoir un enfant." }] }, - - { - "class": "FrenchNounSynonymTransformation", - "inputs": { - "sentence": "Vous ne pouvez pas voter pour cet homme, avec tout ce qu’il a fait!" - }, - "outputs": [{ - "sentence": "Vous ne pouvez pas voter pour cet individu, avec tout ce qu’il a fait!" - }] - - }, - + { "class": "FrenchNounSynonymTransformation", "inputs": { @@ -49,10 +38,10 @@ { "class": "FrenchNounSynonymTransformation", "inputs": { - "sentence": "Il mĂšne son projet rapidement et avec rigueur." + "sentence": "Il entreprend son projet rapidement et avec enthousiasme." }, "outputs": [{ - "sentence": "Il mĂšne son projet rapidement et avec sĂ©vĂ©ritĂ©." + "sentence": "Il entreprend son projet rapidement et avec passion." 
}] } From e02a9a13c4002cffd55f8b97288c3b425c7fce9e Mon Sep 17 00:00:00 2001 From: Louanes Hamla Date: Tue, 7 Sep 2021 09:00:06 +0200 Subject: [PATCH 3/4] use utf-16 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cdab6188c..22b023532 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def recursive_requirements(): os.path.dirname(__file__), folder + "/requirements.txt" ) if os.path.isfile(r_file): - with open(r_file) as f: + with open(r_file,encoding='utf-16') as f: requirements += f.read() + "\n" return requirements From cbce7c45f212c5705c6e3df23565c35d3b28195e Mon Sep 17 00:00:00 2001 From: Louanes Hamla Date: Tue, 7 Sep 2021 09:07:18 +0200 Subject: [PATCH 4/4] specify utf-8 in the read function of setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 22b023532..2d3b67f4b 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def all_folders(): def read(fname): - with open(os.path.join(os.path.dirname(__file__), fname)) as f: + with open(os.path.join(os.path.dirname(__file__), fname),encoding='utf-8') as f: data = f.read() return data @@ -34,7 +34,7 @@ def recursive_requirements(): os.path.dirname(__file__), folder + "/requirements.txt" ) if os.path.isfile(r_file): - with open(r_file,encoding='utf-16') as f: + with open(r_file) as f: requirements += f.read() + "\n" return requirements