Character duplication #184
# Character Duplication

This perturbation adds noise to all types of text sources (sentence, paragraph, etc.) by mimicking the noise produced by keyboard typos and common spelling errors.

Author name: Marco Di Giovanni
Author email: marco.digiovanni@polimi.it
Author Affiliation: Politecnico di Milano and University of Bologna

## What type of a transformation is this?

This transformation acts as a perturbation to test robustness. A few characters, picked at random, are duplicated. The generated sentences remain highly similar to the source sentences.

## What tasks does it intend to benefit?

- This perturbation would benefit all tasks that take a sentence/paragraph/document as input, such as text classification and text generation.
- The generated texts mimic typing mistakes.

## What are the limitations of this transformation?

- This transformation is not capable of generating linguistically diverse text.
- This transformation will mainly affect the performance of token/word-level models, while character-level models should be much more robust.
from .transformation import *
{
  "type": "character_duplication",
  "test_cases": [
    {
      "class": "CharacterDuplication",
      "inputs": {
        "sentence": "Andrew finally returned the French book to Chris that I bought last week"
      },
      "outputs": [{
        "sentence": "Anndrew ffinnallly returrned thee French book too Chhris that I bought last week"
      }]
    },
    {
      "class": "CharacterDuplication",
      "inputs": {
        "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments."
      },
      "outputs": [{
        "sentence": "Seentencees witth gappiing, succhh as Paul likess cooffee and Mary tea, lackk an overt predicate ttoo indiicate tthe relation between two orr moree arrguuments."
      }]
    },
    {
      "class": "CharacterDuplication",
      "inputs": {
        "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"
      },
      "outputs": [{
        "sentence": "Allice inn WWondderland is a 2010 AAmmerican live-acctioon/animated dark fantasyy adventure film"
      }]
    },
    {
      "class": "CharacterDuplication",
      "inputs": {
        "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"
      },
      "outputs": [{
        "sentence": "Ujjjal Deev Dossanjh seerved ass 33rd Premier of Briitish Columbia from 2000 to 2001"
      }]
    },
    {
      "class": "CharacterDuplication",
      "inputs": {
        "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."
      },
      "outputs": [{
        "sentence": "Neeuroplaastticiity is aa continnuuous processingg alllowing short-term, mediium-term, and long-terrmm remoodelingg of the neuronosynaptic orrganizzatiionn."
      }]
    }
  ]
}
import random

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType

**Reviewer:** Please consider adding docstrings, comments, and error handling logic.

**Author:** I added a brief docstring in eb09bbc. I believe that the code is simple enough to understand without the need for more comments.

**Reviewer:** Please add the description of your arguments, following the official docstring convention for Python (PEP 257): https://www.python.org/dev/peps/pep-0257/

**Reviewer:** Don't forget about the error handling logic: what happens if the user enters an illegal value for one of the parameters? Will they receive a human-readable message pointing out what they did wrong, or a generic Python error log when the wrong parameter breaks the code?
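The error handling the reviewer asks for could look like the sketch below. The helper name `validate_params` and the exact messages are assumptions, not part of the PR; the parameter names match the `duplicate` function in this diff.

```python
# Hypothetical validation helper (not part of the PR): fail early with a
# human-readable message instead of a generic Python traceback.
def validate_params(prob, max_outputs):
    if not isinstance(prob, (int, float)) or not 0.0 <= prob <= 1.0:
        raise ValueError(f"prob must be a number between 0 and 1, got {prob!r}")
    if not isinstance(max_outputs, int) or max_outputs < 1:
        raise ValueError(f"max_outputs must be a positive integer, got {max_outputs!r}")
```

`duplicate` would then call `validate_params(prob, max_outputs)` at the top, before seeding the RNG.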
def duplicate(text, prob=0.1, seed=42, max_outputs=1):
    """
    Duplicate random characters (but not digits) in `text` with probability `prob`.

    Returns a list of `max_outputs` perturbed strings.
    """
    random.seed(seed)

    original_text = list(text)
    perturbed_texts = []
    for _ in range(max_outputs):
        # Keep digits and, with probability 1 - prob, other characters as-is;
        # otherwise emit the character twice.
        perturbed_text = [
            [letter]
            if letter.isdigit() or random.random() > prob
            else [letter, letter]
            for letter in original_text
        ]
        # Flatten the list of one- and two-character sublists.
        perturbed_text = [
            letter for sublist in perturbed_text for letter in sublist
        ]
        perturbed_texts.append("".join(perturbed_text))
    return perturbed_texts
class CharacterDuplication(SentenceOperation):
    tasks = [
        TaskType.TEXT_CLASSIFICATION,
        TaskType.TEXT_TO_TEXT_GENERATION,
    ]
    languages = ["All"]
    keywords = [
        "morphological",
        "noise",
        "rule-based",
        "highly-meaning-preserving",
        "high-precision",
        "high-coverage",
        "high-generations",
    ]

    def __init__(self, seed=42, max_outputs=1, prob=0.1):
        super().__init__(seed, max_outputs=max_outputs)
        self.prob = prob

    def generate(self, sentence: str):
        perturbed_texts = duplicate(
            text=sentence,
            prob=self.prob,
            seed=self.seed,
            max_outputs=self.max_outputs,
        )
        return perturbed_texts
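For a quick standalone check of the behaviour, the `duplicate` helper can be exercised on its own. The snippet below restates the function's logic from this diff (flattened into a plain loop) so it runs by itself without the repo's `interfaces` package.

```python
import random

# Restatement of duplicate() from this PR, so the example runs standalone.
def duplicate(text, prob=0.1, seed=42, max_outputs=1):
    random.seed(seed)
    perturbed_texts = []
    for _ in range(max_outputs):
        chars = []
        for letter in text:
            chars.append(letter)
            # Digits are never duplicated; any other character is duplicated
            # with probability `prob`.
            if not letter.isdigit() and random.random() <= prob:
                chars.append(letter)
        perturbed_texts.append("".join(chars))
    return perturbed_texts

# With prob=1.0 every non-digit character (including the space) is doubled,
# while the digits are left alone.
print(duplicate("ab 2000", prob=1.0))  # ['aabb  2000']
```

Because the RNG is re-seeded on every call, the same `seed` always yields the same perturbations, which is what makes the JSON test cases above reproducible.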
**Reviewer:** Triple duplication in the same word doesn't seem like a typical situation. I would suggest adding some rules to limit the generation of such unlikely human input.

**Author:** Here the triple duplication happens just because one of the two 'l' characters in the word "finally" was duplicated, yielding the same letter three times in total. I am not sure how likely this is in real data compared to duplication of characters that appear only once in a word. However, I believe that trained models should be able to process words like "ffinallly" in a similar way to "finally", since humans can easily understand the meaning of a word with this kind of typo.
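One way to implement the rule the reviewer suggests could be to never duplicate a character whose neighbour in the source is already the same letter. This is a sketch, not part of the PR; the name `duplicate_no_triples` is hypothetical, and it assumes that simply skipping already-doubled letters is an acceptable restriction.

```python
import random

def duplicate_no_triples(text, prob=0.1, seed=42, max_outputs=1):
    """Like duplicate(), but never duplicates a character that already
    neighbours an identical one, so a doubled letter such as the 'll'
    in 'finally' can never be turned into 'lll'."""
    random.seed(seed)
    perturbed_texts = []
    for _ in range(max_outputs):
        chars = []
        for i, letter in enumerate(text):
            chars.append(letter)
            # A letter adjacent to an identical letter is left alone.
            repeated = (i > 0 and text[i - 1] == letter) or (
                i + 1 < len(text) and text[i + 1] == letter
            )
            if not letter.isdigit() and not repeated and random.random() <= prob:
                chars.append(letter)
        perturbed_texts.append("".join(chars))
    return perturbed_texts

# Even at prob=1.0, the 'll' in "finally" stays as-is while every other
# character is doubled.
print(duplicate_no_triples("finally", prob=1.0))  # ['ffiinnaallyy']
```

Since a duplicated character is only ever inserted next to characters that differ from it, this variant can never create a run of three identical letters that was not already present in the input.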