GEM-benchmark · Louanes1 · Aug 31, 2021 · Aug 31, 2021 · Sep 7, 2021 · Sep 7, 2021
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@ def all_folders():
 
 
 def read(fname):
-    with open(os.path.join(os.path.dirname(__file__), fname)) as f:
+    with open(os.path.join(os.path.dirname(__file__), fname),encoding='utf-8') as f:  # explicit UTF-8 so reading does not depend on the platform's default encoding
         data = f.read()
     return data
 

diff --git a/transformations/french_synonym_transformation/README.md b/transformations/french_synonym_transformation/README.md
@@ -0,0 +1,20 @@
+# Noun Synonym Substitution 🦎  + ⌨️ → 🐍
+
+
+This transformation replaces some words with synonyms in simple French sentences; a word is a candidate when its POS tag is NOUN. It requires spacy_lefff (an extension of spaCy for French POS tagging and lemmatization) and the nltk package with the Open Multilingual WordNet dictionary.
+
+Authors : Lisa Barthe and Louanes Hamla from Fablab by Inetum in Paris
+
+## What type of transformation is it?
+This transformation creates French paraphrases in which one word is different. The general meaning of the sentence is preserved, and the same sentence can yield several paraphrases, each varying one noun.
+
+## Supported Task
+
+This perturbation can be used for any French task.
+
+## What does it intend to benefit?
+
+This perturbation would benefit all tasks that take a sentence/paragraph/document as input, such as text classification or text generation, and that require synthetic data augmentation / diversification.
+
+## What are the limitations of this transformation?
+This tool does not take the general context into account; sometimes the output will not match the general sense of the sentence.
diff --git a/transformations/french_synonym_transformation/__init__.py b/transformations/french_synonym_transformation/__init__.py
@@ -0,0 +1,2 @@
+from .transformation import *
+
diff --git a/transformations/french_synonym_transformation/requirements.txt b/transformations/french_synonym_transformation/requirements.txt
diff --git a/transformations/french_synonym_transformation/test.json b/transformations/french_synonym_transformation/test.json
@@ -0,0 +1,51 @@
+{
+  "type": "french_synonym_transformation",
+  "test_cases": [
+
+    {
+      "class": "FrenchNounSynonymTransformation",
+      "inputs": {
+        "sentence": "Il vit par la force et par la force il est parti."
+      },
+      "outputs": [{
+        "sentence": "Il vit par la violence et par la violence il est parti."
+      }]
+
+    },
+
+    {
+      "class": "FrenchNounSynonymTransformation",
+      "inputs": {
+        "sentence": "Dans cette vie, vous devez planter un arbre, lire un livre et avoir un enfant."
+      },
+      "outputs": [{
+        "sentence": "Dans cette vie, vous devez planter un arbre, lire un ouvrage et avoir un enfant."
+      }]
+
+    },
+
+    {
+      "class": "FrenchNounSynonymTransformation",
+      "inputs": {
+        "sentence": "La hausse des taux attirera les investissements."
+      },
+      "outputs": [{
+        "sentence": "La hausse des ratio attirera les investissements."
+      }]
+
+    },
+
+    {
+      "class": "FrenchNounSynonymTransformation",
+      "inputs": {
+        "sentence": "Il entreprend son projet rapidement et avec enthousiasme."
+      },
+      "outputs": [{
+        "sentence": "Il entreprend son projet rapidement et avec passion."
+      }]
+
+    }
+
+
+  ]
+}
diff --git a/transformations/french_synonym_transformation/transformation.py b/transformations/french_synonym_transformation/transformation.py
@@ -0,0 +1,85 @@
+from textblob import TextBlob, Blobber, Word
+import re
+from textblob_fr import PatternTagger, PatternAnalyzer
+import nltk
+nltk.download('wordnet')
+from textblob.wordnet import NOUN, VERB, ADV, ADJ
+import spacy
+from spacy_lefff import LefffLemmatizer, POSTagger
+from spacy.language import Language
+from nltk.corpus import wordnet
+import nltk
+nltk.download('omw') 
+
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+@Language.factory('french_lemmatizer')
+def create_french_lemmatizer(nlp, name):
+    return LefffLemmatizer()
+
+@Language.factory('POSTagger')
+def create_POSTagger(nlp, name):
+    return POSTagger()
+
+
+nlp = spacy.load('fr_core_news_md')  # shared French pipeline, loaded once when this module is imported
+
+nlp.add_pipe('POSTagger', name ='pos')  # component built by create_POSTagger above, added under the name 'pos'
+nlp.add_pipe('french_lemmatizer', name='lefff', after='pos')  # component built by create_french_lemmatizer, placed right after 'pos'
+
+
+def synonym_transformation(text):    
+	doc = nlp(text)
+	nouns = [d.text for d in doc if d.pos_ == "NOUN"]
+	synonyms_noun_list = []
+	for i in nouns :
+		dict_noun_synonyms = {}
+		dict_noun_synonyms['noun'] = i
+		dict_noun_synonyms['synonyms'] = list(set([l.name() for syn in wordnet.synsets(i, lang = 'fra') for l in syn.lemmas('fra')]))
+		if len(dict_noun_synonyms['synonyms']) > 0:
+			synonyms_noun_list.append(dict_noun_synonyms)
+
+	valid_noun_list = []
+	for j in synonyms_noun_list:
+		for k in j['synonyms']:
+			valid_noun_dict = {}
+			valid_noun_dict['noun'] = j['noun']
+			valid_noun_dict['syn'] = k
+			if nlp(j['noun']).similarity(nlp(k)) > .55 and not nlp(j['noun']).similarity(nlp(k)) >= .999:
+				valid_noun_list.append(valid_noun_dict)
+
+	text_noun_generated = []
+	pertu=[]
+	for l in valid_noun_list:
+		text_noun_generated.append(text.replace(l['noun'], l['syn']))
+	text_noun_generated.sort(reverse=True)
+	for sent in text_noun_generated:
+		if nlp(text).similarity(nlp(i)) > .40 and not nlp(text).similarity(nlp(i)) >= .999:
+			pertu.append(sent)
+			break
+
+	return pertu
+
+
+
+
+class FrenchNounSynonymTransformation(SentenceOperation):
+    tasks = [
+        TaskType.TEXT_CLASSIFICATION,
+        TaskType.TEXT_TO_TEXT_GENERATION,
+        TaskType.TEXT_TAGGING,
+    ]
+    languages = ["fr"]
+
+    def __init__(self, seed=0, max_outputs=1):
+        super().__init__(seed, max_outputs=max_outputs)
+
+    def generate(self, sentence : str):
+        perturbed_texts = synonym_transformation(
+            sentence
+        )
+        print("perturbed text inside of class",perturbed_texts)
+        return perturbed_texts
+
+