
Commit 623e5d6

Closes SEACrowd#48 | Create dataset loader for SREDFM (SEACrowd#495)
* [New Feature] Add SREDFM dataloader (temp)
* [Fix] Unequal string and unique id bug for SREDFM dataloader
* [Fix] Refactor based on reviews
* [Fix] Remove redundant RE task in constants.py
* [Fix] Implement reviews
* [Fix] Implement review feedback
1 parent 1499a13 commit 623e5d6

File tree

2 files changed: +298 −0 lines changed


seacrowd/sea_datasets/sredfm/__init__.py

Whitespace-only changes.
Lines changed: 298 additions & 0 deletions
@@ -0,0 +1,298 @@
# Some code referenced from https://huggingface.co/datasets/Babelscape/SREDFM/blob/main/SREDFM.py

from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import jsonlines

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@inproceedings{huguet-cabot-et-al-2023-redfm-dataset,
    title = "RED$^{\\rm FM}$: a Filtered and Multilingual Relation Extraction Dataset",
    author = "Huguet Cabot, Pere-Lluís and Tedeschi, Simone and Ngonga Ngomo, Axel-Cyrille and
      Navigli, Roberto",
    booktitle = "Proc. of the 61st Annual Meeting of the Association for Computational Linguistics: ACL 2023",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/2306.09802",
}
"""

_DATASETNAME = "sredfm"


_DESCRIPTION = """\
SREDFM is an automatically annotated dataset for the relation extraction task, covering 18 languages, 400 relation types, and 13 entity types, totaling more than 40 million triplet instances. SREDFM includes Vietnamese.
"""

_HOMEPAGE = "https://github.com/babelscape/rebel"

_LANGUAGES = ["vie"]

_LICENSE = Licenses.CC_BY_SA_4_0.value

_LOCAL = False

_URLS = {
    "train": "https://huggingface.co/datasets/Babelscape/SREDFM/resolve/main/data/train.vi.jsonl",
    "dev": "https://huggingface.co/datasets/Babelscape/SREDFM/resolve/main/data/dev.vi.jsonl",
    "test": "https://huggingface.co/datasets/Babelscape/SREDFM/resolve/main/data/test.vi.jsonl",
    "relations_url": "https://huggingface.co/datasets/Babelscape/SREDFM/raw/main/relations.tsv",
}

_SUPPORTED_TASKS = [Tasks.RELATION_EXTRACTION]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class SREDFMDataset(datasets.GeneratorBasedBuilder):
    """SREDFM is an automatically annotated dataset for the relation extraction task.

    Relation Extraction (RE) is a task that identifies relationships between entities in a text,
    e.g. extracting the triplet (Hanoi, capital, Vietnam) from a sentence that mentions both
    entities, enabling the acquisition of relational facts and bridging the gap between natural
    language and structured knowledge. SREDFM covers 400 relation types, 13 entity types,
    totaling more than 40 million triplet instances."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_kb",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema="seacrowd_kb",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "docid": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "uri": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "entities": [
                        {
                            "uri": datasets.Value(dtype="string"),
                            "surfaceform": datasets.Value(dtype="string"),
                            "type": datasets.Value(dtype="string"),
                            "start": datasets.Value(dtype="int32"),
                            "end": datasets.Value(dtype="int32"),
                        }
                    ],
                    "relations": [
                        {
                            "subject": datasets.Value(dtype="int32"),
                            "predicate": datasets.Value(dtype="string"),
                            "object": datasets.Value(dtype="int32"),
                        }
                    ],
                }
            )

        elif self.config.schema == "seacrowd_kb":
            features = schemas.kb_features

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URLS)

        # Map relation URIs to human-readable names; relations.tsv has four
        # tab-separated columns, of which only the first two are used here.
        relation_names = dict()
        relation_path = data_dir["relations_url"]
        with open(relation_path, encoding="utf-8") as f:
            for row in f:
                rel_code, rel_name, _, _ = row.strip().split("\t")
                relation_names[rel_code] = rel_name

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": data_dir["train"], "relation_names": relation_names},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": data_dir["test"], "relation_names": relation_names},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": data_dir["dev"], "relation_names": relation_names},
            ),
        ]

    def _generate_examples(self, filepath: Path, relation_names: dict) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""

        if self.config.schema == "source":
            with jsonlines.open(filepath) as f:
                # Deduplicate documents by docid.
                skip = set()
                for example in f.iter():
                    if example["docid"] in skip:
                        continue
                    skip.add(example["docid"])

                    entities = []
                    for entity in example["entities"]:
                        entities.append(
                            {
                                "uri": entity["uri"],
                                "surfaceform": entity["surfaceform"],
                                "start": entity["boundaries"][0],
                                "end": entity["boundaries"][1],
                                "type": entity["type"],
                            }
                        )

                    relations = []
                    for relation in example["relations"]:
                        # Keep only relations with a known predicate and confidence above 0.75.
                        if relation["predicate"]["uri"] not in relation_names or relation["confidence"] <= 0.75:
                            continue

                        relations.append(
                            {
                                "subject": entities.index(
                                    {
                                        "uri": relation["subject"]["uri"],
                                        "surfaceform": relation["subject"]["surfaceform"],
                                        "start": relation["subject"]["boundaries"][0],
                                        "end": relation["subject"]["boundaries"][1],
                                        "type": relation["subject"]["type"],
                                    }
                                ),
                                "predicate": relation_names[relation["predicate"]["uri"]],
                                "object": entities.index(
                                    {
                                        "uri": relation["object"]["uri"],
                                        "surfaceform": relation["object"]["surfaceform"],
                                        "start": relation["object"]["boundaries"][0],
                                        "end": relation["object"]["boundaries"][1],
                                        "type": relation["object"]["type"],
                                    }
                                ),
                            }
                        )

                    if len(relations) == 0:
                        continue

                    yield example["docid"], {
                        "docid": example["docid"],
                        "title": example["title"],
                        "uri": example["uri"],
                        "text": example["text"],
                        "entities": entities,
                        "relations": relations,
                    }

        elif self.config.schema == "seacrowd_kb":
            with jsonlines.open(filepath) as f:
                skip = set()
                # Running counter used to build unique ids for passages, entities, and relations.
                i = 0
                for example in f.iter():
                    if example["docid"] in skip:
                        continue
                    skip.add(example["docid"])

                    i += 1
                    processed_text = example["text"].replace("\n", " ")
                    passages = [
                        {
                            "id": f"{i}-{example['uri']}",
                            "type": "text",
                            "text": [processed_text],
                            "offsets": [[0, len(processed_text)]],
                        }
                    ]

                    entities = []
                    for entity in example["entities"]:
                        entities.append(
                            {
                                "id": entity["uri"],
                                "type": entity["type"],
                                "text": [entity["surfaceform"]],
                                "offsets": [entity["boundaries"]],
                                "normalized": {"db_name": "", "db_id": ""},
                            }
                        )

                    relations = []
                    for relation in example["relations"]:
                        if relation["predicate"]["uri"] not in relation_names or relation["confidence"] <= 0.75:
                            continue

                        i += 1
                        sub = relation["subject"]
                        pred = relation["predicate"]
                        obj = relation["object"]
                        relations.append(
                            {
                                "id": f"{i}-{sub['uri']}-{pred['uri']}-{obj['uri']}",
                                "type": relation_names[pred["uri"]],
                                # Arguments reference entities by their position in the entities list.
                                "arg1_id": str(
                                    entities.index(
                                        {
                                            "id": sub["uri"],
                                            "type": sub["type"],
                                            "text": [sub["surfaceform"]],
                                            "offsets": [sub["boundaries"]],
                                            "normalized": {"db_name": "", "db_id": ""},
                                        }
                                    )
                                ),
                                "arg2_id": str(
                                    entities.index(
                                        {
                                            "id": obj["uri"],
                                            "type": obj["type"],
                                            "text": [obj["surfaceform"]],
                                            "offsets": [obj["boundaries"]],
                                            "normalized": {"db_name": "", "db_id": ""},
                                        }
                                    )
                                ),
                                "normalized": {"db_name": "", "db_id": ""},
                            }
                        )

                    # Prefix entity ids with the counter only after relations are built,
                    # since entities.index(...) above matches on the unprefixed dicts.
                    for entity in entities:
                        i += 1
                        entity["id"] = f"{i}-{entity['id']}"

                    if len(relations) == 0:
                        continue

                    yield example["docid"], {
                        "id": example["docid"],
                        "passages": passages,
                        "entities": entities,
                        "relations": relations,
                        "events": [],
                        "coreferences": [],
                    }
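
For context, a minimal usage sketch of the new loader. It is hypothetical: the local checkout path below is a placeholder (the dataloader module's filename is not shown on this page), and it assumes a recent version of HuggingFace `datasets` is installed.

# Hypothetical usage sketch; "path/to/seacrowd/sea_datasets/sredfm" is a placeholder
# for the directory containing the loader script added in this commit.
import datasets

# Source schema: SREDFM's native docid/title/uri/text/entities/relations layout.
sredfm = datasets.load_dataset("path/to/seacrowd/sea_datasets/sredfm", name="sredfm_source")
print(sredfm["train"][0]["relations"][0])  # e.g. {"subject": 0, "predicate": "...", "object": 1}

# SEACrowd KB schema: the same documents normalized into passages/entities/relations.
sredfm_kb = datasets.load_dataset("path/to/seacrowd/sea_datasets/sredfm", name="sredfm_seacrowd_kb")

Note that both configs filter out relations with confidence at or below 0.75 and drop documents left with no relations, so the yielded examples are a subset of the raw JSONL files.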
