|
| 1 | +# Some code referenced from https://huggingface.co/datasets/Babelscape/SREDFM/blob/main/SREDFM.py |
| 2 | + |
| 3 | +from pathlib import Path |
| 4 | +from typing import Dict, List, Tuple |
| 5 | + |
| 6 | +import datasets |
| 7 | +import jsonlines |
| 8 | + |
| 9 | +from seacrowd.utils import schemas |
| 10 | +from seacrowd.utils.configs import SEACrowdConfig |
| 11 | +from seacrowd.utils.constants import Licenses, Tasks |
| 12 | + |
| 13 | +_CITATION = """\ |
| 14 | +@inproceedings{huguet-cabot-et-al-2023-redfm-dataset, |
| 15 | + title = "RED$^{\rm FM}$: a Filtered and Multilingual Relation Extraction Dataset", |
| 16 | + author = "Huguet Cabot, Pere-Lluís and Tedeschi, Simone and Ngonga Ngomo, Axel-Cyrille and |
| 17 | + Navigli, Roberto", |
| 18 | + booktitle = "Proc. of the 61st Annual Meeting of the Association for Computational Linguistics: ACL 2023", |
| 19 | + month = jul, |
| 20 | + year = "2023", |
| 21 | + address = "Toronto, Canada", |
| 22 | + publisher = "Association for Computational Linguistics", |
| 23 | + url = "https://arxiv.org/abs/2306.09802", |
| 24 | +} |
| 25 | +""" |
| 26 | + |
| 27 | +_DATASETNAME = "sredfm" |
| 28 | + |
| 29 | + |
| 30 | +_DESCRIPTION = """\ |
| 31 | +SREDFM is an automatically annotated dataset for relation extraction task covering 18 languages, 400 relation types, 13 entity types, totaling more than 40 million triplet instances. SREDFM includes Vietnamnese. |
| 32 | +""" |
| 33 | + |
| 34 | +_HOMEPAGE = "https://github.com/babelscape/rebel" |
| 35 | + |
| 36 | +_LANGUAGES = ["vie"] |
| 37 | + |
# License string taken from the project-level ``Licenses`` enum.
_LICENSE = Licenses.CC_BY_SA_4_0.value

# Every file this loader needs is referenced by a remote URL in ``_URLS``.
_LOCAL = False

# Download locations: the three Vietnamese JSON-lines splits plus the
# tab-separated table that maps relation codes to relation names.
_URLS = dict(
    train="https://huggingface.co/datasets/Babelscape/SREDFM/resolve/main/data/train.vi.jsonl",
    dev="https://huggingface.co/datasets/Babelscape/SREDFM/resolve/main/data/dev.vi.jsonl",
    test="https://huggingface.co/datasets/Babelscape/SREDFM/resolve/main/data/test.vi.jsonl",
    relations_url="https://huggingface.co/datasets/Babelscape/SREDFM/raw/main/relations.tsv",
)

_SUPPORTED_TASKS = [Tasks.RELATION_EXTRACTION]

# Version of the upstream data and of the SEACrowd-schema view, respectively.
_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"
| 54 | + |
| 55 | + |
class SREDFMDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder for the Vietnamese files of SREDFM.

    SREDFM is an automatically annotated dataset for relation extraction task.
    Relation Extraction (RE) is a task that identifies relationships between entities in a text,
    enabling the acquisition of relational facts and bridging the gap between natural language
    and structured knowledge. SREDFM covers 400 relation types, 13 entity types,
    totaling more than 40 million triplet instances.

    Two configurations are provided: ``sredfm_source`` (document-level records with
    entity and relation lists) and ``sredfm_seacrowd_kb`` (the shared ``kb`` schema).
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_kb",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema="seacrowd_kb",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    # A relation is kept only when its confidence is strictly greater than this value.
    _MIN_CONFIDENCE = 0.75

    def _info(self) -> datasets.DatasetInfo:
        """Build the ``DatasetInfo`` (features and metadata) for the active schema.

        Raises:
            ValueError: If the configured schema is neither ``source`` nor ``seacrowd_kb``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "docid": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "uri": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "entities": [
                        {
                            "uri": datasets.Value(dtype="string"),
                            "surfaceform": datasets.Value(dtype="string"),
                            "type": datasets.Value(dtype="string"),
                            "start": datasets.Value(dtype="int32"),
                            "end": datasets.Value(dtype="int32"),
                        }
                    ],
                    "relations": [
                        {
                            # "subject" / "object" are positions in the "entities" list.
                            "subject": datasets.Value(dtype="int32"),
                            "predicate": datasets.Value(dtype="string"),
                            "object": datasets.Value(dtype="int32"),
                        }
                    ],
                }
            )
        elif self.config.schema == "seacrowd_kb":
            features = schemas.kb_features
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(f"Unsupported schema: {self.config.schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators.

        Downloads every file listed in ``_URLS``, loads the relation-code to
        relation-name table once, and hands it to each split generator.
        """
        data_dir = dl_manager.download_and_extract(_URLS)
        relation_names = self._load_relation_names(data_dir["relations_url"])

        # (split name, key in ``_URLS``) pairs, in the order the splits are emitted.
        split_sources = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "dev"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={"filepath": data_dir[url_key], "relation_names": relation_names},
            )
            for split_name, url_key in split_sources
        ]

    @staticmethod
    def _load_relation_names(relation_path) -> Dict[str, str]:
        """Read the tab-separated relations table into a ``{code: name}`` dict.

        Only the first two columns are used. Blank lines are skipped, and rows
        with extra or fewer trailing columns are accepted as long as the code
        and the name are present.

        Raises:
            ValueError: If a non-blank row has fewer than two tab-separated fields.
        """
        relation_names = {}
        with open(relation_path, encoding="utf-8") as f:
            for row in f:
                row = row.strip()
                if not row:
                    continue
                rel_code, rel_name, *_ = row.split("\t")
                relation_names[rel_code] = rel_name
        return relation_names

    @staticmethod
    def _iter_unique_documents(filepath):
        """Yield the records of a JSON-lines file, keeping the first record per ``docid``."""
        seen_docids = set()
        with jsonlines.open(filepath) as reader:
            for example in reader.iter():
                docid = example["docid"]
                if docid in seen_docids:
                    continue
                seen_docids.add(docid)
                yield example

    @staticmethod
    def _mention_key(mention) -> tuple:
        """Return the tuple of fields that identifies an entity mention inside a document."""
        boundaries = mention["boundaries"]
        return (
            mention["uri"],
            mention["surfaceform"],
            boundaries[0],
            boundaries[1],
            mention["type"],
        )

    def _kept_relations(self, example, relation_names):
        """Yield ``(relation, subject_index, object_index)`` for the relations to keep.

        A relation is dropped when its predicate URI is missing from
        ``relation_names`` or its confidence is not above ``_MIN_CONFIDENCE``.
        The two indices are positions in ``example["entities"]``; when the same
        mention occurs more than once, the first occurrence is used.

        Raises:
            ValueError: If the subject or object of a kept relation is not among
                the entities of the document.
        """
        first_position = {}
        for position, entity in enumerate(example["entities"]):
            # setdefault keeps the earliest position for duplicated mentions.
            first_position.setdefault(self._mention_key(entity), position)

        for relation in example["relations"]:
            if relation["predicate"]["uri"] not in relation_names or relation["confidence"] <= self._MIN_CONFIDENCE:
                continue
            try:
                subject_index = first_position[self._mention_key(relation["subject"])]
                object_index = first_position[self._mention_key(relation["object"])]
            except KeyError:
                raise ValueError(
                    f"Relation argument not found among the entities of document {example['docid']!r}"
                ) from None
            yield relation, subject_index, object_index

    def _generate_examples(self, filepath: Path, relation_names: dict) -> Tuple[str, Dict]:
        """Yields examples as (key, example) tuples.

        Args:
            filepath: Path of the downloaded JSON-lines file for one split.
            relation_names: Mapping from relation code to relation name.

        Yields:
            ``(docid, record)`` pairs in the layout of the active schema.
            Documents left without any relation after filtering are skipped.
        """
        if self.config.schema == "source":
            yield from self._generate_source_examples(filepath, relation_names)
        elif self.config.schema == "seacrowd_kb":
            yield from self._generate_kb_examples(filepath, relation_names)

    def _generate_source_examples(self, filepath, relation_names):
        """Yield ``(docid, record)`` pairs in the ``source`` schema."""
        for example in self._iter_unique_documents(filepath):
            entities = [
                {
                    "uri": entity["uri"],
                    "surfaceform": entity["surfaceform"],
                    "start": entity["boundaries"][0],
                    "end": entity["boundaries"][1],
                    "type": entity["type"],
                }
                for entity in example["entities"]
            ]

            relations = [
                {
                    "subject": subject_index,
                    "predicate": relation_names[relation["predicate"]["uri"]],
                    "object": object_index,
                }
                for relation, subject_index, object_index in self._kept_relations(example, relation_names)
            ]

            if not relations:
                continue

            yield example["docid"], {
                "docid": example["docid"],
                "title": example["title"],
                "uri": example["uri"],
                "text": example["text"],
                "entities": entities,
                "relations": relations,
            }

    def _generate_kb_examples(self, filepath, relation_names):
        """Yield ``(docid, record)`` pairs in the ``seacrowd_kb`` schema.

        A single running counter prefixes every passage, relation and entity id
        so that ids stay unique across the split. Within a document the counter
        is consumed by the passage first, then the kept relations, then the
        entities. Relation arguments (``arg1_id`` / ``arg2_id``) hold the final
        id of the referenced entity.
        """
        counter = 0
        for example in self._iter_unique_documents(filepath):
            counter += 1
            # Newlines become spaces; the length is unchanged, so entity offsets still apply.
            processed_text = example["text"].replace("\n", " ")
            passages = [
                {
                    "id": f"{counter}-{example['uri']}",
                    "type": "text",
                    "text": [processed_text],
                    "offsets": [[0, len(processed_text)]],
                }
            ]

            # Reserve relation numbers before entity numbers (see docstring).
            numbered_relations = []
            for relation, subject_index, object_index in self._kept_relations(example, relation_names):
                counter += 1
                numbered_relations.append((counter, relation, subject_index, object_index))

            entities = []
            for entity in example["entities"]:
                counter += 1
                entities.append(
                    {
                        "id": f"{counter}-{entity['uri']}",
                        "type": entity["type"],
                        "text": [entity["surfaceform"]],
                        "offsets": [entity["boundaries"]],
                        "normalized": {"db_name": "", "db_id": ""},
                    }
                )

            if not numbered_relations:
                continue

            relations = []
            for number, relation, subject_index, object_index in numbered_relations:
                sub = relation["subject"]
                pred = relation["predicate"]
                obj = relation["object"]
                relations.append(
                    {
                        "id": f"{number}-{sub['uri']}-{pred['uri']}-{obj['uri']}",
                        "type": relation_names[pred["uri"]],
                        # Point at the entity ids actually emitted above so that
                        # relation arguments can be resolved against "entities".
                        "arg1_id": entities[subject_index]["id"],
                        "arg2_id": entities[object_index]["id"],
                        "normalized": {"db_name": "", "db_id": ""},
                    }
                )

            yield example["docid"], {
                "id": example["docid"],
                "passages": passages,
                "entities": entities,
                "relations": relations,
                "events": [],
                "coreferences": [],
            }
0 commit comments