GEM-benchmark · JerryX1110 · Dec 9, 2021 · Dec 9, 2021
diff --git a/transformations/chinese_to_digits/README.md b/transformations/chinese_to_digits/README.md
@@ -0,0 +1,21 @@
+# Chinese Numerical Words to Digits 十→ 🔟
+This transformation converts numbers written in Chinese characters into digits, such as “一” (one) to 1, "二" (two) to 2, and "百" (a hundred) to 100.
+
+
+
+Author name: Xiaohao Xu
+Author email: xxh11102019@outlook.com
+
+## What type of a transformation is this?
+This transformation converts numbers written in Chinese into their digit form.
+
+## What tasks does it intend to benefit?
+This transformation is a vital component of Chinese NLP systems such as chatbots, knowledge graphs, and OCR systems. It would benefit all tasks that take as input a sentence/paragraph/document containing numbers written in capital Chinese numerals, such as text classification and text generation.
+
+## Previous Work
+
+1) chinese2digits project: https://github.com/Wall-ee/chinese2digits
+
+## What are the limitations of this transformation?
+Its behavior depends on the implementation provided by the chinese2digits project.
+
diff --git a/transformations/chinese_to_digits/__init__.py b/transformations/chinese_to_digits/__init__.py
@@ -0,0 +1 @@
+from .transformation import *
diff --git a/transformations/chinese_to_digits/requirements.txt b/transformations/chinese_to_digits/requirements.txt
@@ -0,0 +1 @@
+chinese2digits
diff --git a/transformations/chinese_to_digits/test.json b/transformations/chinese_to_digits/test.json
@@ -0,0 +1,60 @@
+{
+  "type": "chinese_to_digits",
+  "test_cases": [
+    {
+      "class": "ChineseToDigits",
+      "inputs": {
+        "sentence": "小明今年亏了十万，真糟糕！"
+      },
+      "outputs": [
+        {
+          "sentence": "小明今年亏了100000.0，真糟糕！"
+        }
+      ]
+    },
+    {
+      "class": "ChineseToDigits",
+      "inputs": {
+        "sentence": "一只羊，两只羊，三只羊。。。"
+      },
+      "outputs": [
+        {
+          "sentence": "1只羊，2只羊，3只羊。。。"
+        }
+      ]
+    },
+    {
+      "class": "ChineseToDigits",
+      "inputs": {
+        "sentence": "您好，一共伍亿柒仟万拾柒元。"
+      },
+      "outputs": [
+        {
+          "sentence": "您好，一共570000017元。"
+        }
+      ]
+    },
+    {
+      "class": "ChineseToDigits",
+      "inputs": {
+        "sentence": "这次奖学金的获得者只占全校同学的万分之四"
+      },
+      "outputs": [
+        {
+          "sentence": "这次奖学金的获得者只占全校同学的0.0004"
+        }
+      ]
+    },
+    {
+      "class": "ChineseToDigits",
+      "inputs": {
+        "sentence": "随着两个遗迹文明的发展，他们终于开始了争斗。遗迹之间的能量冲突是战争的导火索。"
+      },
+      "outputs": [
+        {
+          "sentence": "随着2个遗迹文明的发展，他们终于开始了争斗。遗迹之间的能量冲突是战争的导火索。"
+        }
+      ]
+    }
+  ]
+}
diff --git a/transformations/chinese_to_digits/transformation.py b/transformations/chinese_to_digits/transformation.py
@@ -0,0 +1,56 @@
+import random
+import chinese2digits as c2d
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+"""
+Chinese numerical words to digits
+"""
+
+class ChineseToDigits(SentenceOperation):
+    """Rewrite Chinese numerical words found in a sentence as digits.
+
+    The conversion itself is delegated to
+    ``chinese2digits.takeChineseNumberFromString``; this class only decides,
+    for each generated output, whether the conversion is applied.
+    """
+
+    tasks = [
+        TaskType.TEXT_CLASSIFICATION,
+        TaskType.TEXT_TO_TEXT_GENERATION,
+    ]
+    languages = ["zh"]
+    keywords = ["morphological", "lexical", "api-based", "written", "highly-meaning-preserving"]
+
+    def __init__(self, seed=0, max_outputs=1, transformation_prob=1):
+        """Store the sampling configuration.
+
+        Args:
+            seed: Value passed to ``random.seed`` at the start of every
+                ``generate`` call.
+            max_outputs: Number of sentences returned by ``generate``.
+            transformation_prob: Probability that the transformation is
+                applied to the input text when producing one output.
+        """
+        super().__init__(seed, max_outputs=max_outputs)
+        self.transformation_prob = transformation_prob
+        self.seed = seed
+        self.max_outputs = max_outputs
+
+    def generate(self, sentence: str):
+        """Return ``max_outputs`` versions of ``sentence`` with numbers as digits.
+
+        Args:
+            sentence: Input text that may contain Chinese numerical words.
+
+        Returns:
+            A list with ``max_outputs`` strings. Each entry is the converted
+            sentence with probability ``transformation_prob`` and the
+            unchanged input otherwise.
+        """
+        # Re-seed on every call so repeated calls give the same outputs.
+        random.seed(self.seed)
+
+        transformed_texts = []
+        for _ in range(self.max_outputs):
+            if random.random() <= self.transformation_prob:
+                # Convert the sentence as a whole rather than character by
+                # character: a multi-character number such as "十万" can only
+                # be read as one value when its characters are passed together.
+                new_text = c2d.takeChineseNumberFromString(sentence)["replacedText"]
+            else:
+                new_text = sentence
+            transformed_texts.append(new_text)
+        return transformed_texts
+
+"""
+if __name__ == '__main__':
+    simp_text = "小明今年亏了十万，真糟糕！"
+    trans_func = ChineseToDigits()
+    new_text = trans_func.generate(simp_text)
+    print(new_text)
+
+"""
+