diff --git a/setup.py b/setup.py index cdab6188c..2d3b67f4b 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def all_folders(): def read(fname): - with open(os.path.join(os.path.dirname(__file__), fname)) as f: + with open(os.path.join(os.path.dirname(__file__), fname),encoding='utf-8') as f: data = f.read() return data diff --git a/transformations/french_synonym_transformation/README.md b/transformations/french_synonym_transformation/README.md new file mode 100644 index 000000000..5dd50940e --- /dev/null +++ b/transformations/french_synonym_transformation/README.md @@ -0,0 +1,20 @@ +# Noun Synonym Substitution 🦎 + ⌨️ → 🐍 + + +This transformation replaces some words with synonyms when their POS tag is NOUN, for simple French sentences. It requires Spacy_lefff (an extension of spaCy for French POS tagging and lemmatization) and the nltk package with the Open Multilingual WordNet dictionary. + +Authors: Lisa Barthe and Louanes Hamla from Fablab by Inetum in Paris + +## What type of transformation is it? +This transformation makes it possible to create paraphrases with a different word in French. The general meaning of the sentence is preserved, but it can be expressed as different paraphrases that each vary one noun. + +## Supported Task + +This perturbation can be used for any French task. + +## What does it intend to benefit? + +This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. that require synthetic data augmentation / diversification. + +## What are the limitations of this transformation? +This tool does not take the general context into account; sometimes the output will not match the general sense of the sentence. 
\ No newline at end of file diff --git a/transformations/french_synonym_transformation/__init__.py b/transformations/french_synonym_transformation/__init__.py new file mode 100644 index 000000000..89ecd1199 --- /dev/null +++ b/transformations/french_synonym_transformation/__init__.py @@ -0,0 +1,2 @@ +from .transformation import * + diff --git a/transformations/french_synonym_transformation/requirements.txt b/transformations/french_synonym_transformation/requirements.txt new file mode 100644 index 000000000..4821ff3dc Binary files /dev/null and b/transformations/french_synonym_transformation/requirements.txt differ diff --git a/transformations/french_synonym_transformation/test.json b/transformations/french_synonym_transformation/test.json new file mode 100644 index 000000000..612a9aea9 --- /dev/null +++ b/transformations/french_synonym_transformation/test.json @@ -0,0 +1,51 @@ +{ + "type": "french_synonym_transformation", + "test_cases": [ + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "Il vit par la force et par la force il est parti." + }, + "outputs": [{ + "sentence": "Il vit par la violence et par la violence il est parti." + }] + + }, + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "Dans cette vie, vous devez planter un arbre, lire un livre et avoir un enfant." + }, + "outputs": [{ + "sentence": "Dans cette vie, vous devez planter un arbre, lire un ouvrage et avoir un enfant." + }] + + }, + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "La hausse des taux attirera les investissements." + }, + "outputs": [{ + "sentence": "La hausse des ratio attirera les investissements." + }] + + }, + + { + "class": "FrenchNounSynonymTransformation", + "inputs": { + "sentence": "Il entreprend son projet rapidement et avec enthousiasme." + }, + "outputs": [{ + "sentence": "Il entreprend son projet rapidement et avec passion." 
"""French noun-synonym substitution for sentence-level data augmentation.

Importing this module downloads the NLTK ``wordnet`` and ``omw`` corpora and
loads the spaCy ``fr_core_news_md`` pipeline, extended with the spacy_lefff
POS tagger and lemmatizer.
"""
from textblob import TextBlob, Blobber, Word
import re
from textblob_fr import PatternTagger, PatternAnalyzer
import nltk
nltk.download('wordnet')
from textblob.wordnet import NOUN, VERB, ADV, ADJ
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
from spacy.language import Language
from nltk.corpus import wordnet
import nltk
nltk.download('omw')

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType


@Language.factory('french_lemmatizer')
def create_french_lemmatizer(nlp, name):
    """Factory returning the spacy_lefff lemmatizer component."""
    return LefffLemmatizer()


@Language.factory('POSTagger')
def create_POSTagger(nlp, name):
    """Factory returning the spacy_lefff POS tagger component."""
    return POSTagger()


nlp = spacy.load('fr_core_news_md')

nlp.add_pipe('POSTagger', name ='pos')
nlp.add_pipe('french_lemmatizer', name='lefff', after='pos')

# A synonym is kept only when its similarity to the noun lies strictly
# between these bounds; the upper bound rejects (near-)identical words.
_MIN_WORD_SIMILARITY = .55
# A rewritten sentence is kept only when its similarity to the input lies
# strictly between these bounds; the upper bound rejects unchanged sentences.
_MIN_SENTENCE_SIMILARITY = .40
_MAX_SIMILARITY = .999


def _replace_whole_word(text, word, replacement):
    """Replace every standalone occurrence of *word* in *text*.

    Occurrences embedded in a longer word are left alone, unlike
    ``str.replace``. The replacement is passed through a callable so that
    backslashes or group references in it are inserted literally.
    """
    pattern = r"(?<!\w)" + re.escape(word) + r"(?!\w)"
    return re.sub(pattern, lambda _match: replacement, text)


def synonym_transformation(text):
    """Return a paraphrase of *text* with one noun swapped for a synonym.

    Nouns are the tokens whose ``pos_`` is ``"NOUN"``. For each noun, the
    French WordNet lemmas are filtered by vector similarity to the noun, and
    each surviving synonym yields one candidate sentence. Candidates are
    sorted in reverse lexicographic order so the choice is deterministic.

    Args:
        text: The French sentence to perturb.

    Returns:
        A list holding the first candidate whose similarity to *text* lies
        within the accepted range, or an empty list when there is none.
    """
    doc = nlp(text)
    # De-duplicate while keeping first-seen order: a noun occurring twice
    # would otherwise produce the same candidates twice.
    nouns = list(dict.fromkeys(tok.text for tok in doc if tok.pos_ == "NOUN"))

    candidates = []
    for noun in nouns:
        synonyms = {
            lemma.name()
            for synset in wordnet.synsets(noun, lang='fra')
            for lemma in synset.lemmas('fra')
        }
        if not synonyms:
            continue
        noun_doc = nlp(noun)  # parsed once per noun, not once per synonym
        for synonym in synonyms:
            score = noun_doc.similarity(nlp(synonym))
            if _MIN_WORD_SIMILARITY < score < _MAX_SIMILARITY:
                candidate = _replace_whole_word(text, noun, synonym)
                if candidate != text:
                    candidates.append(candidate)

    candidates.sort(reverse=True)
    for candidate in candidates:
        # Compare the input with the candidate sentence itself. The previous
        # code compared it with a loop variable leaked from the noun loop,
        # so the generated sentence was never actually checked.
        score = doc.similarity(nlp(candidate))
        if _MIN_SENTENCE_SIMILARITY < score < _MAX_SIMILARITY:
            return [candidate]

    return []


class FrenchNounSynonymTransformation(SentenceOperation):
    """Sentence operation that substitutes one French noun with a synonym."""

    tasks = [
        TaskType.TEXT_CLASSIFICATION,
        TaskType.TEXT_TO_TEXT_GENERATION,
        TaskType.TEXT_TAGGING,
    ]
    languages = ["fr"]

    def __init__(self, seed=0, max_outputs=1):
        super().__init__(seed, max_outputs=max_outputs)

    def generate(self, sentence: str):
        """Return the perturbed sentences produced for *sentence*.

        The result is the list returned by ``synonym_transformation``: at
        most one sentence, possibly none.
        """
        return synonym_transformation(sentence)