Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

French Nouns Transformation #247

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def all_folders():


def read(fname):
    """Return the text content of *fname*, located next to this file.

    The path is resolved relative to the directory containing this module.
    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII content is read the same way everywhere.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # A single open() with an explicit encoding; the context manager closes
    # the handle even if read() raises.
    with open(path, encoding="utf-8") as f:
        return f.read()

Expand Down
20 changes: 20 additions & 0 deletions transformations/french_synonym_transformation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Noun Synonym Substitution 🦎 + ⌨️ → 🐍


This transformation replaces some words with synonyms when their POS tag is NOUN, for simple French sentences. It requires spacy_lefff (an extension of spaCy for French POS tagging and lemmatization) and the nltk package with the Open Multilingual WordNet dictionary.

Authors : Lisa Barthe and Louanes Hamla from Fablab by Inetum in Paris

## What type of transformation is it?
This transformation creates paraphrases of a French sentence by substituting a different word. The general meaning of the sentence is preserved, but it can be expressed as different paraphrases, each varying one noun.

## Supported Task

This perturbation can be used for any French task.

## What does it intend to benefit ?

This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. that requires synthetic data augmentation / diversification.

## What are the limitations of this transformation?
This tool does not take the general context into account; sometimes the output will not match the general sense of the sentence.
2 changes: 2 additions & 0 deletions transformations/french_synonym_transformation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .transformation import *

Binary file not shown.
51 changes: 51 additions & 0 deletions transformations/french_synonym_transformation/test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{
"type": "french_synonym_transformation",
"test_cases": [

{
"class": "FrenchNounSynonymTransformation",
"inputs": {
"sentence": "Il vit par la force et par la force il est parti."
},
"outputs": [{
"sentence": "Il vit par la violence et par la violence il est parti."
}]

},

{
"class": "FrenchNounSynonymTransformation",
"inputs": {
"sentence": "Dans cette vie, vous devez planter un arbre, lire un livre et avoir un enfant."
},
"outputs": [{
"sentence": "Dans cette vie, vous devez planter un arbre, lire un ouvrage et avoir un enfant."
}]

},

{
"class": "FrenchNounSynonymTransformation",
"inputs": {
"sentence": "La hausse des taux attirera les investissements."
},
"outputs": [{
"sentence": "La hausse des ratio attirera les investissements."
}]

},

{
"class": "FrenchNounSynonymTransformation",
"inputs": {
"sentence": "Il entreprend son projet rapidement et avec enthousiasme."
},
"outputs": [{
"sentence": "Il entreprend son projet rapidement et avec passion."
}]

}


]
}
85 changes: 85 additions & 0 deletions transformations/french_synonym_transformation/transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from textblob import TextBlob, Blobber, Word
import re
from textblob_fr import PatternTagger, PatternAnalyzer
import nltk
nltk.download('wordnet')
from textblob.wordnet import NOUN, VERB, ADV, ADJ
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
from spacy.language import Language
from nltk.corpus import wordnet
import nltk
nltk.download('omw')

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType

@Language.factory('french_lemmatizer')
def create_french_lemmatizer(nlp, name):
    """Build the component registered with spaCy as 'french_lemmatizer'.

    Both ``nlp`` and ``name`` are part of the factory signature but are not
    used here; a fresh, default-configured ``LefffLemmatizer`` is returned.
    """
    lemmatizer = LefffLemmatizer()
    return lemmatizer

@Language.factory('POSTagger')
def create_POSTagger(nlp, name):
    """Build the component registered with spaCy as 'POSTagger'.

    Both ``nlp`` and ``name`` are part of the factory signature but are not
    used here; a fresh, default-configured ``POSTagger`` is returned.
    """
    tagger = POSTagger()
    return tagger


# Shared spaCy pipeline, loaded once when this module is imported and reused
# by every call in this file.
nlp = spacy.load("fr_core_news_md")

# Add the 'POSTagger' factory under the component name "pos", then place the
# 'french_lemmatizer' factory (component name "lefff") directly after it.
nlp.add_pipe("POSTagger", name="pos")
nlp.add_pipe("french_lemmatizer", name="lefff", after="pos")


def synonym_transformation(text):
    """Return a paraphrase of *text* with one noun replaced by a synonym.

    Every token tagged ``NOUN`` by the module-level ``nlp`` pipeline is looked
    up in the French WordNet (``lang='fra'``). A synonym is kept when its
    vector similarity with the noun lies strictly between 0.55 and 0.999
    (close in meaning, but not the same word). Each kept synonym produces a
    candidate sentence in which every whole-word occurrence of the noun is
    replaced. Candidates are sorted in reverse lexicographic order so the
    result does not depend on set iteration order, and the first candidate
    whose similarity with the original sentence lies strictly between 0.40
    and 0.999 is returned.

    Args:
        text: The French sentence to perturb.

    Returns:
        A list holding the single selected paraphrase, or an empty list when
        no candidate passes both similarity filters.
    """
    doc = nlp(text)
    # dict.fromkeys drops repeated nouns while keeping first-seen order, so a
    # noun that occurs twice in the sentence is only processed once.
    nouns = list(dict.fromkeys(tok.text for tok in doc if tok.pos_ == "NOUN"))

    candidates = []
    for noun in nouns:
        synonyms = {
            lemma.name()
            for synset in wordnet.synsets(noun, lang='fra')
            for lemma in synset.lemmas('fra')
        }
        if not synonyms:
            continue

        # Parse the noun once instead of once (or twice) per synonym.
        noun_doc = nlp(noun)
        # Match the noun only as a whole word so that it is not rewritten
        # inside a longer word that merely contains it.
        noun_pattern = re.compile(r"(?<!\w)" + re.escape(noun) + r"(?!\w)")
        for synonym in synonyms:
            score = noun_doc.similarity(nlp(synonym))
            if 0.55 < score < 0.999:
                # A callable replacement keeps any backslash in the synonym
                # literal instead of being read as a regex escape.
                candidates.append(
                    noun_pattern.sub(lambda _m, repl=synonym: repl, text)
                )

    candidates.sort(reverse=True)
    for sent in candidates:
        # Compare the original sentence with the candidate itself; the upper
        # bound rejects candidates that are (near-)identical to the input.
        score = doc.similarity(nlp(sent))
        if 0.40 < score < 0.999:
            return [sent]

    return []




class FrenchNounSynonymTransformation(SentenceOperation):
    """Sentence operation that swaps a French noun for a WordNet synonym.

    The work is delegated to ``synonym_transformation``; this class only
    declares the supported tasks and language and adapts the call to the
    ``SentenceOperation`` interface.
    """

    # Task types this transformation declares itself applicable to.
    tasks = [
        TaskType.TEXT_CLASSIFICATION,
        TaskType.TEXT_TO_TEXT_GENERATION,
        TaskType.TEXT_TAGGING,
    ]
    # Only French input is supported.
    languages = ["fr"]

    def __init__(self, seed=0, max_outputs=1):
        """Forward ``seed`` and ``max_outputs`` to the base class."""
        super().__init__(seed, max_outputs=max_outputs)

    def generate(self, sentence : str):
        """Return the perturbed versions of *sentence* as a list of strings.

        The list is whatever ``synonym_transformation`` returns and may be
        empty when no suitable synonym is found.
        """
        # Imported locally so this block does not depend on a new
        # module-level import.
        import logging

        perturbed_texts = synonym_transformation(
            sentence
        )
        # Diagnostic output goes through logging at DEBUG level rather than
        # being printed to stdout on every call.
        logging.getLogger(__name__).debug(
            "perturbed text inside of class %s", perturbed_texts
        )
        return perturbed_texts