diff --git a/transformations/french_synonym_adjectives_transformation/README.md b/transformations/french_synonym_adjectives_transformation/README.md new file mode 100644 index 000000000..d0e10c3b0 --- /dev/null +++ b/transformations/french_synonym_adjectives_transformation/README.md @@ -0,0 +1,20 @@ +# Adjective Synonym Substitution 🦎 + ⌨️ → 🐍 + + +This transformation replaces words whose POS tag is ADJ with synonyms, in simple French sentences. It requires Spacy_lefff (an extension of spaCy for French POS tagging and lemmatization) and the nltk package with the Open Multilingual WordNet dictionary. + +Authors : Lisa Barthe and Louanes Hamla from Fablab by Inetum in Paris + +## What type of transformation is it? +This transformation creates French paraphrases in which one word is replaced by a different one. The general meaning of the sentence is preserved, and the sentence can be expressed as different paraphrases, each varying a single adjective. + +## Supported Task + +This perturbation can be used for any French task. + +## What does it intend to benefit? + +This perturbation would benefit all tasks that take a sentence/paragraph/document as input, such as text classification, text generation, etc., and that require synthetic data augmentation / diversification. + +## What are the limitations of this transformation? +This tool does not take the general context into account; sometimes the output will not match the general sense of the sentence. 
\ No newline at end of file diff --git a/transformations/french_synonym_adjectives_transformation/__init__.py b/transformations/french_synonym_adjectives_transformation/__init__.py new file mode 100644 index 000000000..89ecd1199 --- /dev/null +++ b/transformations/french_synonym_adjectives_transformation/__init__.py @@ -0,0 +1,2 @@ +from .transformation import * + diff --git a/transformations/french_synonym_adjectives_transformation/requirements.txt b/transformations/french_synonym_adjectives_transformation/requirements.txt new file mode 100644 index 000000000..bfb6b6816 --- /dev/null +++ b/transformations/french_synonym_adjectives_transformation/requirements.txt @@ -0,0 +1,5 @@ +#fr core news md model : +fr-core-news-md @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.0.0/fr_core_news_md-3.0.0-py3-none-any.whl +spacy-lefff==0.4.0 +textblob_fr==0.2.0 +nltk \ No newline at end of file diff --git a/transformations/french_synonym_adjectives_transformation/test.json b/transformations/french_synonym_adjectives_transformation/test.json new file mode 100644 index 000000000..2190e35ad --- /dev/null +++ b/transformations/french_synonym_adjectives_transformation/test.json @@ -0,0 +1,40 @@ +{ + "type": "french_synonym_adjectives_transformation", + "test_cases": [ + + + { + "class": "FrenchAdjectivesSynonymTransformation", + "inputs": { + "sentence": "Le sanglier a des dents pointues et un pelage sombre" + }, + "outputs": [{ + "sentence": "Le sanglier a des dents pointues et un pelage noir" + + }] + }, + { + "class": "FrenchAdjectivesSynonymTransformation", + "inputs": { + "sentence": "Ce fut un impressionnant festival de mode, les mannequins sont tous jolis." + }, + "outputs": [{ + "sentence": "Ce fut un imposant festival de mode, les mannequins sont tous jolis." + }] + + }, + + { + "class": "FrenchAdjectivesSynonymTransformation", + "inputs": { + "sentence": "Nous sommes venu en grand nombre." 
"""French adjective-synonym substitution.

Replaces adjectives of a French sentence with WordNet synonyms and keeps the
rewritten sentences whose spaCy similarity to the input is high enough.
"""
from textblob import TextBlob, Blobber, Word
import re
from textblob_fr import PatternTagger, PatternAnalyzer
import nltk
# The corpus downloads are kept at import time and in their original order,
# interleaved with the imports exactly as before.
nltk.download('wordnet')
from textblob.wordnet import NOUN, VERB, ADV, ADJ
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
from spacy.language import Language
from nltk.corpus import wordnet
nltk.download('omw')

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType


@Language.factory('french_lemmatizer')
def create_french_lemmatizer(nlp, name):
    """spaCy component factory that returns a ``LefffLemmatizer``."""
    return LefffLemmatizer()


@Language.factory('POSTagger')
def create_POSTagger(nlp, name):
    """spaCy component factory that returns a spacy-lefff ``POSTagger``."""
    return POSTagger()


# Shared pipeline: the French model, then the 'pos' component, then 'lefff'
# (added after 'pos').
nlp = spacy.load('fr_core_news_md')

nlp.add_pipe('POSTagger', name='pos')
nlp.add_pipe('french_lemmatizer', name='lefff', after='pos')

# A synonym is kept when its similarity to the adjective is above the minimum
# and below the "same text" ceiling; the same ceiling is applied to whole
# sentences so that an unchanged sentence is never returned.
_WORD_SIMILARITY_MIN = .50
_SENTENCE_SIMILARITY_MIN = .90
_IDENTICAL_SIMILARITY = .999


def _replace_whole_word(text, word, replacement):
    """Replace every standalone occurrence of ``word`` in ``text``.

    Lookarounds are used instead of ``str.replace`` so that ``word`` is not
    rewritten when it is only a fragment of a longer word.
    """
    pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
    # A callable replacement keeps backslashes in ``replacement`` literal.
    return re.sub(pattern, lambda _match: replacement, text)


def synonym_transformation(text, max_outputs=1):
    """Return paraphrases of ``text`` with one adjective swapped for a synonym.

    Args:
        text: The sentence to perturb.
        max_outputs: Maximum number of paraphrases to return. The default of 1
            matches the historical behaviour (first accepted candidate only).

    Returns:
        A list of at most ``max_outputs`` rewritten sentences, taken in
        alphabetical order of the candidates; empty when no candidate passes
        the similarity filters.
    """
    doc = nlp(text)
    # dict.fromkeys de-duplicates repeated adjectives while keeping order.
    adjectives = list(dict.fromkeys(
        token.text for token in doc if token.pos_ == "ADJ"
    ))

    candidates = set()
    for adjective in adjectives:
        synonyms = {
            lemma.name()
            for synset in wordnet.synsets(adjective, lang='fra', pos=ADJ)
            for lemma in synset.lemmas('fra')
        }
        if not synonyms:
            continue
        # Parse the adjective once instead of once (twice) per synonym.
        adjective_doc = nlp(adjective)
        for synonym in synonyms:
            score = adjective_doc.similarity(nlp(synonym))
            if _WORD_SIMILARITY_MIN < score < _IDENTICAL_SIMILARITY:
                candidates.add(_replace_whole_word(text, adjective, synonym))

    perturbed = []
    # Sorting keeps the selection deterministic.
    for candidate in sorted(candidates):
        if len(perturbed) >= max_outputs:
            break
        score = doc.similarity(nlp(candidate))
        if _SENTENCE_SIMILARITY_MIN < score < _IDENTICAL_SIMILARITY:
            perturbed.append(candidate)

    return perturbed


class FrenchAdjectivesSynonymTransformation(SentenceOperation):
    """Sentence operation that swaps French adjectives for synonyms."""

    tasks = [
        TaskType.TEXT_CLASSIFICATION,
        TaskType.TEXT_TO_TEXT_GENERATION,
        TaskType.TEXT_TAGGING,
    ]
    languages = ["fr"]

    def __init__(self, seed=0, max_outputs=1):
        super().__init__(seed, max_outputs=max_outputs)
        # NOTE(review): the base class presumably stores max_outputs already;
        # it is set here explicitly so generate() does not depend on that.
        self.max_outputs = max_outputs

    def generate(self, sentence: str):
        """Return up to ``max_outputs`` perturbed versions of ``sentence``."""
        return synonym_transformation(sentence, max_outputs=self.max_outputs)