
Commit ca8e109

TysonYu and jensan-1 authored
Closes SEACrowd#314 | Add dataloader for Indonesia chinese mt robust eval (SEACrowd#388)
* add dataloader for indonesian_madurese_bible_translation
* update dataloader for indonesia_chinese_mtrobusteval
* Delete seacrowd/sea_datasets/indonesian_madurese_bible_translation/indonesian_madurese_bible_translation.py
* Update indonesia_chinese_mtrobusteval.py
* update code based on the reviewer comments
* add __init__.py
* Update seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py
* Update seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py

---------

Co-authored-by: Jennifer Santoso <62987563+jen-santoso@users.noreply.github.com>
1 parent 8482c95 commit ca8e109

File tree

2 files changed: +151 -0 lines


seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/__init__.py

Whitespace-only changes.
seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py

151 additions & 0 deletions

@@ -0,0 +1,151 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import jsonlines
import pandas as pd

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@article{,
  author = {supryzhu},
  title = {Indonesia-Chinese-MTRobustEval},
  journal = {None},
  volume = {None},
  year = {2023},
  url = {https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval},
  doi = {None},
  biburl = {None},
  bibsource = {None}
}
"""


_DATASETNAME = "indonesia_chinese_mtrobusteval"

_DESCRIPTION = """\
The dataset is curated for the purpose of evaluating the robustness of Neural Machine Translation (NMT) towards naturally occurring noise
(typos, slang, code switching, etc.). The dataset is crawled from Twitter, then pre-processed to obtain sentences with noise.
The dataset consists of a thousand noisy sentences. The dataset is translated into Chinese manually as the benchmark for evaluating the robustness of NMT.
"""

_HOMEPAGE = "https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval"

_LANGUAGES = ["ind", "cmn"]  # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)


_LICENSE = Licenses.MIT.value  # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value

_LOCAL = False

_URLS = {
    _DATASETNAME: "https://github.com/supryzhu/Indonesia-Chinese-MTRobustEval/raw/main/data/Indonesia-Chinese.xlsx",
}

_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]  # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class IndonesiaChineseMtRobustEval(datasets.GeneratorBasedBuilder):
    """The dataset consists of a thousand noisy sentences. The dataset is translated into Chinese manually as the benchmark for evaluating the robustness of NMT."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description="indonesia_chinese_mtrobusteval source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_t2t",
            version=SEACROWD_VERSION,
            description="indonesia_chinese_mtrobusteval SEACrowd schema",
            schema="seacrowd_t2t",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:

        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "src": datasets.Value("string"),
                    "tgt": datasets.Value("string"),
                }
            )

        elif self.config.schema == "seacrowd_t2t":
            features = schemas.text2text_features

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        urls = _URLS[_DATASETNAME]
        file_path = dl_manager.download(urls)
        df = pd.read_excel(file_path)
        src = df["Indonesia"].tolist()
        tgt = df["Chinese"].tolist()
        results = []
        for i, item in enumerate(src):
            results.append({"id": str(i), "src": item, "tgt": tgt[i]})
        self._write_jsonl(file_path + ".jsonl", results)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": file_path + ".jsonl",
                    "split": "train",
                },
            )
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        if self.config.schema == "source":
            i = 0
            with jsonlines.open(filepath) as f:
                for each_data in f.iter():
                    ex = {
                        "id": each_data["id"],
                        "src": each_data["src"],
                        "tgt": each_data["tgt"],
                    }
                    yield i, ex
                    i += 1

        elif self.config.schema == "seacrowd_t2t":
            i = 0
            with jsonlines.open(filepath) as f:
                for each_data in f.iter():
                    ex = {"id": each_data["id"], "text_1": each_data["src"], "text_2": each_data["tgt"], "text_1_name": "ind", "text_2_name": "cmn"}
                    yield i, ex
                    i += 1

    def _write_jsonl(self, filepath, values):
        with jsonlines.open(filepath, "w") as writer:
            for line in values:
                writer.write(line)
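For quick verification of the two configs this commit adds, a minimal usage sketch follows. It assumes a local clone of the SEACrowd repo and a recent Hugging Face `datasets` release; the relative script path and the `trust_remote_code` flag are environment assumptions, not part of this change.

# Minimal usage sketch (assumption: run from the root of a local SEACrowd checkout).
import datasets

# Source schema: plain "id" / "src" (Indonesian) / "tgt" (Chinese) rows.
source = datasets.load_dataset(
    "seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py",
    name="indonesia_chinese_mtrobusteval_source",
    split="train",
    trust_remote_code=True,  # newer datasets versions require this to run a local loading script
)

# SEACrowd text-to-text schema: "text_1"/"text_2" pairs tagged "ind"/"cmn".
t2t = datasets.load_dataset(
    "seacrowd/sea_datasets/indonesia_chinese_mtrobusteval/indonesia_chinese_mtrobusteval.py",
    name="indonesia_chinese_mtrobusteval_seacrowd_t2t",
    split="train",
    trust_remote_code=True,
)

print(source[0])  # {"id": "0", "src": "...", "tgt": "..."}
print(t2t[0])     # {"id": "0", "text_1": "...", "text_2": "...", "text_1_name": "ind", "text_2_name": "cmn"}

Both configs expose a single "train" split, matching the `_split_generators` implementation above.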
