From 4f3e70439e43ad2f32af2b12eb68b6e1aaee6bdc Mon Sep 17 00:00:00 2001 From: "U-CORP\\ishan.jindal" Date: Thu, 28 Dec 2023 12:08:18 +0530 Subject: [PATCH 01/18] add initiaizer --- seacrowd/sea_datasets/ud/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100755 seacrowd/sea_datasets/ud/__init__.py diff --git a/seacrowd/sea_datasets/ud/__init__.py b/seacrowd/sea_datasets/ud/__init__.py new file mode 100755 index 000000000..e69de29bb From 74239d83ec64375a8661bc7a66130ae48b1c3563 Mon Sep 17 00:00:00 2001 From: ijindal Date: Thu, 28 Dec 2023 12:14:30 +0530 Subject: [PATCH 02/18] add ud driver --- seacrowd/sea_datasets/ud/ud.py | 176 +++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100755 seacrowd/sea_datasets/ud/ud.py diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py new file mode 100755 index 000000000..0768f9720 --- /dev/null +++ b/seacrowd/sea_datasets/ud/ud.py @@ -0,0 +1,176 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +from conllu import TokenList + +from seacrowd.utils import schemas +from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks + + +_CITATION = "" + +_LANGUAGES = ["ind", "vie"] +_LOCAL = False + +_DATASETNAME = "ud-v2.13" + +_SUBSETS = {"id_gsd" : "UD_Indonesian-GSD", + "id_csui": "UD_Indonesian-CSUI", + "id_pud" : "UD_Indonesian-PUD", + "vi_vtb": "UD_Vietnamese-VTB"} + +_DESCRIPTION = """\ +Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation + for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and + parsing research from a language typology perspective. The annotation scheme is based on an evolution of (universal) + Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags + (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). + The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent + annotation of similar constructions across languages, while allowing language-specific extensions when necessary. 
+""" + +_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" + +_LICENSE = "Apache license 2.0 (apache-2.0)" + +_URLS = { + "ud-v2.12": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/ud-treebanks-v2.12.tgz?sequence=1&isAllowed=y", + "ud-v2.13": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5287/ud-treebanks-v2.13.tgz?sequence=1&isAllowed=y" +} + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + + + + +class UDDataset(datasets.GeneratorBasedBuilder): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME} source schema", + schema="source", + subset_id=f"{_DATASETNAME}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" + + UPOS_TAGS = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"] + + def _info(self) -> datasets.DatasetInfo: + self.config.schema = "seacrowd_seq_label" + if self.config.schema == "source": + features = datasets.Features( + { + # metadata + "sent_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_en": datasets.Value("string"), + # tokens + "id": [datasets.Value("string")], + "form": [datasets.Value("string")], + "lemma": [datasets.Value("string")], + "upos": [datasets.Value("string")], + "xpos": [datasets.Value("string")], + "feats": [datasets.Value("string")], + "head": [datasets.Value("string")], + "deprel": [datasets.Value("string")], + "deps": [datasets.Value("string")], + "misc": [datasets.Value("string")], + } + ) + + elif self.config.schema == "seacrowd_seq_label": + features = schemas.seq_label_features(self.UPOS_TAGS) + + else: + raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + # data_path = dl_manager.download(urls) + + return [] + + + def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: + """instance tuple generated in the form (key, labels)""" + + dataset = list( + load_ud_data( + filepath, + filter_kwargs={"id": lambda i: isinstance(i, int)}, + ) + ) + + if self.config.schema == "source": + pass + + elif self.config.schema == "seacrowd_seq_label": + dataset = list( + map( + lambda d: { + "id": d["sent_id"], + "tokens": d["form"], + "labels": d["upos"], + }, + dataset, + ) + ) + + else: + raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") + + for key, example in enumerate(dataset): + yield key, example + + +if __name__ == "__main__": + data = datasets.load_dataset(__file__) + print("xx") From a40363c8d234fef500fd4815541aad4fdb5d42f9 Mon Sep 17 00:00:00 2001 From: ijindal Date: Thu, 28 Dec 2023 12:25:34 +0530 Subject: [PATCH 03/18] subsets datasets loaders --- seacrowd/sea_datasets/ud/ud_id_csui.py | 113 +++++++++++++++++++++++ 
seacrowd/sea_datasets/ud/ud_id_gsd.py | 121 +++++++++++++++++++++++++ seacrowd/sea_datasets/ud/ud_id_pud.py | 106 ++++++++++++++++++++++ seacrowd/sea_datasets/ud/ud_vi_vtb.py | 121 +++++++++++++++++++++++++ 4 files changed, 461 insertions(+) create mode 100755 seacrowd/sea_datasets/ud/ud_id_csui.py create mode 100755 seacrowd/sea_datasets/ud/ud_id_gsd.py create mode 100755 seacrowd/sea_datasets/ud/ud_id_pud.py create mode 100755 seacrowd/sea_datasets/ud/ud_vi_vtb.py diff --git a/seacrowd/sea_datasets/ud/ud_id_csui.py b/seacrowd/sea_datasets/ud/ud_id_csui.py new file mode 100755 index 000000000..7bfec87ee --- /dev/null +++ b/seacrowd/sea_datasets/ud/ud_id_csui.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks +from seacrowd.sea_datasets.ud.ud import UDDataset, _DATASETNAME + +_CITATION = "" + +_LANGUAGES = ["ind", "vie"] +_LOCAL = False + +_SUBSET = "id_csui" + +_DESCRIPTION = """\ +Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation + for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and + parsing research from a language typology perspective. The annotation scheme is based on an evolution of (universal) + Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags + (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). + The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent + annotation of similar constructions across languages, while allowing language-specific extensions when necessary. 
+""" + +_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" + +_LICENSE = "Apache license 2.0 (apache-2.0)" + +_URLS = { + "id_csui": { + "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-train.conllu", + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-test.conllu", + }, +} + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + + +class UdIdCSUIDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_SUBSET] + data_path = dl_manager.download(urls) + print(data_path) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_path["train"] + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ) + ] + + +if __name__ == "__main__": + data = datasets.load_dataset(__file__) \ No newline at end of file diff --git a/seacrowd/sea_datasets/ud/ud_id_gsd.py b/seacrowd/sea_datasets/ud/ud_id_gsd.py new file mode 100755 index 000000000..4f80d3539 --- /dev/null +++ b/seacrowd/sea_datasets/ud/ud_id_gsd.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks +from seacrowd.sea_datasets.ud.ud import UDDataset, _DATASETNAME + +_CITATION = "" + +_LANGUAGES = ["ind", "vie"] +_LOCAL = False + +_SUBSET = "id_gsd" + +_DESCRIPTION = """\ +Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation + for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and + parsing research from a language typology perspective. 
The annotation scheme is based on an evolution of (universal) + Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags + (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). + The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent + annotation of similar constructions across languages, while allowing language-specific extensions when necessary. +""" + +_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" + +_LICENSE = "Apache license 2.0 (apache-2.0)" + +_URLS = { + "id_gsd": { + "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-train.conllu", + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-test.conllu", + "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu", + }, +} + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + + +class UdIdGSDDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_SUBSET] + data_path = dl_manager.download(urls) + print(data_path) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_path["train"] + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_path["dev"], + }, + ), + ] + + +if __name__ == "__main__": + data = datasets.load_dataset(__file__) + diff --git a/seacrowd/sea_datasets/ud/ud_id_pud.py b/seacrowd/sea_datasets/ud/ud_id_pud.py new file mode 100755 index 000000000..d8cb84a0e --- /dev/null +++ b/seacrowd/sea_datasets/ud/ud_id_pud.py @@ -0,0 +1,106 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
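+
+# NOTE: UD_Indonesian-PUD only distributes a test file upstream, so unlike
+# the GSD loader above, _split_generators below registers a single TEST split.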
+ +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks +from seacrowd.sea_datasets.ud.ud import UDDataset, _DATASETNAME + +_CITATION = "" + +_LANGUAGES = ["ind", "vie"] +_LOCAL = False + +_SUBSET = "id_pud" + +_DESCRIPTION = """\ +Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation + for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and + parsing research from a language typology perspective. The annotation scheme is based on an evolution of (universal) + Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags + (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). + The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent + annotation of similar constructions across languages, while allowing language-specific extensions when necessary. +""" + +_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" + +_LICENSE = "Apache license 2.0 (apache-2.0)" + +_URLS = { + "id_pud": { + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-PUD/master/id_pud-ud-test.conllu" + }, +} + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + + +class UdIdPUDDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_SUBSET] + data_path = dl_manager.download(urls) + print(data_path) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ) + ] + + +if __name__ == "__main__": + data = datasets.load_dataset(__file__) \ No newline at end of file diff --git a/seacrowd/sea_datasets/ud/ud_vi_vtb.py b/seacrowd/sea_datasets/ud/ud_vi_vtb.py new file mode 100755 index 000000000..0b6f2970d --- /dev/null +++ b/seacrowd/sea_datasets/ud/ud_vi_vtb.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks +from seacrowd.sea_datasets.ud.ud import UDDataset, _DATASETNAME + +_CITATION = "" + +_LANGUAGES = ["ind", "vie"] +_LOCAL = False + +_SUBSET = "vi_vtb" + +_DESCRIPTION = """\ +Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation + for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and + parsing research from a language typology perspective. The annotation scheme is based on an evolution of (universal) + Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags + (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). + The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent + annotation of similar constructions across languages, while allowing language-specific extensions when necessary. +""" + +_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" + +_LICENSE = "Apache license 2.0 (apache-2.0)" + +_URLS = { + "vi_vtb": { + "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-train.conllu", + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-test.conllu", + "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-dev.conllu", + }, +} + +_SUPPORTED_TASKS = [Tasks.POS_TAGGING] + +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + + +class UdViVTBDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_SUBSET] + data_path = dl_manager.download(urls) + print(data_path) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_path["train"] + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_path["dev"], 
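+                # UD distributes this split as a "dev" file; it is surfaced
+                # to the datasets library as the VALIDATION split.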
+ }, + ), + ] + + +if __name__ == "__main__": + data = datasets.load_dataset(__file__) + From c8ce267186797a1f8a469b8922af50bcffedc3b4 Mon Sep 17 00:00:00 2001 From: Ishan Jindal Date: Thu, 28 Dec 2023 12:40:21 +0530 Subject: [PATCH 04/18] Update common_parser.py for utf8 encoding --- seacrowd/utils/common_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seacrowd/utils/common_parser.py b/seacrowd/utils/common_parser.py index c4fbbfe29..5e7350c9a 100644 --- a/seacrowd/utils/common_parser.py +++ b/seacrowd/utils/common_parser.py @@ -6,7 +6,7 @@ def load_conll_data(file_path): # Read file - data = open(file_path, "r").readlines() + data = open(file_path, "r", encoding="utf8").readlines() # Prepare buffer dataset = [] @@ -34,7 +34,7 @@ def load_ud_data(filepath, filter_kwargs=None, assert_fn=None): :param assert_fn: assertion to make sure raw data is in the expected format :return: generator with schema following CONLLU """ - dataset_raw = parse(open(filepath).read()) + dataset_raw = parse(open(filepath, encoding="utf8").read()) filter_kwargs = filter_kwargs or dict() if callable(assert_fn): From 9772965e9b250c008f29c31b9692ecaf4d245e25 Mon Sep 17 00:00:00 2001 From: ijindal Date: Mon, 15 Jan 2024 23:34:47 +0530 Subject: [PATCH 05/18] Remove separate functions --- seacrowd/sea_datasets/ud/ud_id_csui.py | 113 ----------------------- seacrowd/sea_datasets/ud/ud_id_gsd.py | 121 ------------------------- seacrowd/sea_datasets/ud/ud_id_pud.py | 106 ---------------------- seacrowd/sea_datasets/ud/ud_vi_vtb.py | 121 ------------------------- 4 files changed, 461 deletions(-) delete mode 100755 seacrowd/sea_datasets/ud/ud_id_csui.py delete mode 100755 seacrowd/sea_datasets/ud/ud_id_gsd.py delete mode 100755 seacrowd/sea_datasets/ud/ud_id_pud.py delete mode 100755 seacrowd/sea_datasets/ud/ud_vi_vtb.py diff --git a/seacrowd/sea_datasets/ud/ud_id_csui.py b/seacrowd/sea_datasets/ud/ud_id_csui.py deleted file mode 100755 index 7bfec87ee..000000000 --- a/seacrowd/sea_datasets/ud/ud_id_csui.py +++ /dev/null @@ -1,113 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets - -from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb -from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks -from seacrowd.sea_datasets.ud.ud import UDDataset, _DATASETNAME - -_CITATION = "" - -_LANGUAGES = ["ind", "vie"] -_LOCAL = False - -_SUBSET = "id_csui" - -_DESCRIPTION = """\ -Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation - for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and - parsing research from a language typology perspective. 
The annotation scheme is based on an evolution of (universal) - Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags - (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). - The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent - annotation of similar constructions across languages, while allowing language-specific extensions when necessary. -""" - -_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" - -_LICENSE = "Apache license 2.0 (apache-2.0)" - -_URLS = { - "id_csui": { - "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-train.conllu", - "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-test.conllu", - }, -} - -_SUPPORTED_TASKS = [Tasks.POS_TAGGING] - -_SOURCE_VERSION = "1.0.0" - -_SEACROWD_VERSION = "1.0.0" - - - -class UdIdCSUIDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[_SUBSET] - data_path = dl_manager.download(urls) - print(data_path) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_path["train"] - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ) - ] - - -if __name__ == "__main__": - data = datasets.load_dataset(__file__) \ No newline at end of file diff --git a/seacrowd/sea_datasets/ud/ud_id_gsd.py b/seacrowd/sea_datasets/ud/ud_id_gsd.py deleted file mode 100755 index 4f80d3539..000000000 --- a/seacrowd/sea_datasets/ud/ud_id_gsd.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets - -from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb -from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks -from seacrowd.sea_datasets.ud.ud import UDDataset, _DATASETNAME - -_CITATION = "" - -_LANGUAGES = ["ind", "vie"] -_LOCAL = False - -_SUBSET = "id_gsd" - -_DESCRIPTION = """\ -Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation - for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and - parsing research from a language typology perspective. The annotation scheme is based on an evolution of (universal) - Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags - (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). - The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent - annotation of similar constructions across languages, while allowing language-specific extensions when necessary. -""" - -_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" - -_LICENSE = "Apache license 2.0 (apache-2.0)" - -_URLS = { - "id_gsd": { - "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-train.conllu", - "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-test.conllu", - "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu", - }, -} - -_SUPPORTED_TASKS = [Tasks.POS_TAGGING] - -_SOURCE_VERSION = "1.0.0" - -_SEACROWD_VERSION = "1.0.0" - - - -class UdIdGSDDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[_SUBSET] - data_path = dl_manager.download(urls) - print(data_path) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_path["train"] - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": data_path["dev"], - }, - ), - ] - - -if __name__ == "__main__": - data = datasets.load_dataset(__file__) - diff --git a/seacrowd/sea_datasets/ud/ud_id_pud.py b/seacrowd/sea_datasets/ud/ud_id_pud.py deleted file mode 100755 index d8cb84a0e..000000000 --- a/seacrowd/sea_datasets/ud/ud_id_pud.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script 
contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets - -from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb -from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks -from seacrowd.sea_datasets.ud.ud import UDDataset, _DATASETNAME - -_CITATION = "" - -_LANGUAGES = ["ind", "vie"] -_LOCAL = False - -_SUBSET = "id_pud" - -_DESCRIPTION = """\ -Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation - for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and - parsing research from a language typology perspective. The annotation scheme is based on an evolution of (universal) - Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags - (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). - The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent - annotation of similar constructions across languages, while allowing language-specific extensions when necessary. 
-""" - -_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" - -_LICENSE = "Apache license 2.0 (apache-2.0)" - -_URLS = { - "id_pud": { - "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-PUD/master/id_pud-ud-test.conllu" - }, -} - -_SUPPORTED_TASKS = [Tasks.POS_TAGGING] - -_SOURCE_VERSION = "1.0.0" - -_SEACROWD_VERSION = "1.0.0" - - - -class UdIdPUDDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[_SUBSET] - data_path = dl_manager.download(urls) - print(data_path) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ) - ] - - -if __name__ == "__main__": - data = datasets.load_dataset(__file__) \ No newline at end of file diff --git a/seacrowd/sea_datasets/ud/ud_vi_vtb.py b/seacrowd/sea_datasets/ud/ud_vi_vtb.py deleted file mode 100755 index 0b6f2970d..000000000 --- a/seacrowd/sea_datasets/ud/ud_vi_vtb.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets - -from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb -from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks -from seacrowd.sea_datasets.ud.ud import UDDataset, _DATASETNAME - -_CITATION = "" - -_LANGUAGES = ["ind", "vie"] -_LOCAL = False - -_SUBSET = "vi_vtb" - -_DESCRIPTION = """\ -Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation - for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and - parsing research from a language typology perspective. The annotation scheme is based on an evolution of (universal) - Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags - (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008). 
- The general philosophy is to provide a universal inventory of categories and guidelines to facilitate consistent - annotation of similar constructions across languages, while allowing language-specific extensions when necessary. -""" - -_HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" - -_LICENSE = "Apache license 2.0 (apache-2.0)" - -_URLS = { - "vi_vtb": { - "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-train.conllu", - "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-test.conllu", - "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-dev.conllu", - }, -} - -_SUPPORTED_TASKS = [Tasks.POS_TAGGING] - -_SOURCE_VERSION = "1.0.0" - -_SEACROWD_VERSION = "1.0.0" - - - -class UdViVTBDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[_SUBSET] - data_path = dl_manager.download(urls) - print(data_path) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_path["train"] - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": data_path["dev"], - }, - ), - ] - - -if __name__ == "__main__": - data = datasets.load_dataset(__file__) - From 775e2094978da23197907cf41fedc31aa9120b95 Mon Sep 17 00:00:00 2001 From: ijindal Date: Mon, 15 Jan 2024 23:36:53 +0530 Subject: [PATCH 06/18] Unifying all subsets implementation --- seacrowd/sea_datasets/ud/ud.py | 250 ++++++++++++++++++++++++++++++--- 1 file changed, 234 insertions(+), 16 deletions(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index 0768f9720..7e153c2b6 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -49,11 +49,28 @@ _HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" -_LICENSE = "Apache license 2.0 (apache-2.0)" +_LICENSE = ["Apache license 2.0 (apache-2.0)"] _URLS = { "ud-v2.12": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/ud-treebanks-v2.12.tgz?sequence=1&isAllowed=y", - "ud-v2.13": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5287/ud-treebanks-v2.13.tgz?sequence=1&isAllowed=y" + "ud-v2.13": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5287/ud-treebanks-v2.13.tgz?sequence=1&isAllowed=y", + "id_csui": { + "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-train.conllu", + "test": 
"https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-test.conllu", + }, + "id_gsd": { + "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-train.conllu", + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-test.conllu", + "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu", + }, + "id_pud": { + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-PUD/master/id_pud-ud-test.conllu" + }, + "vi_vtb": { + "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-train.conllu", + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-test.conllu", + "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-dev.conllu", + }, } _SUPPORTED_TASKS = [Tasks.POS_TAGGING] @@ -98,20 +115,20 @@ def _info(self) -> datasets.DatasetInfo: features = datasets.Features( { # metadata - "sent_id": datasets.Value("string"), - "text": datasets.Value("string"), - "text_en": datasets.Value("string"), + "sent_id": datasets.Sequence(datasets.Value("string")), + "text": datasets.Sequence(datasets.Value("string")), + "text_en": datasets.Sequence(datasets.Value("string")), # tokens - "id": [datasets.Value("string")], - "form": [datasets.Value("string")], - "lemma": [datasets.Value("string")], - "upos": [datasets.Value("string")], - "xpos": [datasets.Value("string")], - "feats": [datasets.Value("string")], - "head": [datasets.Value("string")], - "deprel": [datasets.Value("string")], - "deps": [datasets.Value("string")], - "misc": [datasets.Value("string")], + "id": [datasets.Sequence(datasets.Value("string"))], + "form": [datasets.Sequence(datasets.Value("string"))], + "lemma": [datasets.Sequence(datasets.Value("string"))], + "upos": [datasets.Sequence(datasets.Value("string"))], + "xpos": [datasets.Sequence(datasets.Value("string"))], + "feats": [datasets.Sequence(datasets.Value("string"))], + "head": [datasets.Sequence(datasets.Value("string"))], + "deprel": [datasets.Sequence(datasets.Value("string"))], + "deps": [datasets.Sequence(datasets.Value("string"))], + "misc": [datasets.Sequence(datasets.Value("string"))], } ) @@ -171,6 +188,207 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: yield key, example +class UdIdCSUIDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + _SUBSET = "id_csui" + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[self._SUBSET] + data_path = dl_manager.download(urls) + + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": 
data_path["train"] + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ) + ] + + +class UdIdGSDDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + _SUBSET = "id_gsd" + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[self._SUBSET] + data_path = dl_manager.download(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_path["train"] + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_path["dev"], + }, + ), + ] + +class UdViVTBDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + _SUBSET = "vi_vtb" + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[self._SUBSET] + data_path = dl_manager.download(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_path["train"] + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_path["dev"], + }, + ), + ] + +class UdIdPUDDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + _SUBSET = "id_pud" + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def 
_split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[self._SUBSET] + data_path = dl_manager.download(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ) + ] + + if __name__ == "__main__": data = datasets.load_dataset(__file__) - print("xx") + From a76f1a1ee9e4d5dae795e39970237d69f854d693 Mon Sep 17 00:00:00 2001 From: Ishan Jindal Date: Mon, 15 Jan 2024 23:39:12 +0530 Subject: [PATCH 07/18] Reverting utf encoding --- seacrowd/utils/common_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/utils/common_parser.py b/seacrowd/utils/common_parser.py index 5e7350c9a..7c1a27747 100644 --- a/seacrowd/utils/common_parser.py +++ b/seacrowd/utils/common_parser.py @@ -6,7 +6,7 @@ def load_conll_data(file_path): # Read file - data = open(file_path, "r", encoding="utf8").readlines() + data = open(file_path, "r").readlines() # Prepare buffer dataset = [] From 11199c7a0304a970b4aa8b62bf2f619857139ba1 Mon Sep 17 00:00:00 2001 From: Ishan Jindal Date: Sun, 10 Mar 2024 10:24:04 +0530 Subject: [PATCH 08/18] Update seacrowd/sea_datasets/ud/ud.py Co-authored-by: Salsabil Maulana Akbar --- seacrowd/sea_datasets/ud/ud.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index 7e153c2b6..ae1b3162f 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -25,7 +25,21 @@ from seacrowd.utils.constants import Tasks -_CITATION = "" +_CITATION = """ + @misc{11234/1-5287, + title = {Universal Dependencies 2.13}, + author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\"e}mi and Aghaei, Hamid and Agi{\'c}, {\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Akkurt, + Salih Furkan and Aleksandravi{\v c}i{\=u}t{\.e}, Gabriel{\.e} and Alfina, Ika and Algom, Avner and Alnajjar, Khalid and Alzetta, Chiara and Andersen, Erik and Antonsen, Lene and Aoyama, Tatsuya and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranes, Glyd and Aranzabe, Maria Jesus and Ar{\i}can, Bilge Nas and Arnard{\'o}ttir, {\t H}{\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and {\'A}sgeirsd{\'o}ttir, Katla and Aslan, Deniz Baran and Asmazo{\u g}lu, Cengiz and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Avel{\~a}s, Mariana and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Barkarson, Starkaður and Basile, Rodolfo and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Behzad, Shabnam and Belieni, Juan and Bengoetxea, Kepa and Benli, İbrahim and Ben Moshe, Yifat and Berk, G{\"o}zde and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\.e}, Agn{\.e} and Bjarnad{\'o}ttir, Krist{\'{\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\"{\i}}c and Borges V{\"o}lker, Emanuel and B{\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Braggaar, Anouck and Branco, Ant{\'o}nio and Brokait{\.e}, Kristina and Burchardt, Aljoscha and Campos, Marisa and Candito, Marie and Caron, Bernard and Caron, Gauthier and Carvalheiro, Catarina and Carvalho, Rita and Cassidy, 
Lauren and Castro, Maria Clara and Castro, S{\'e}rgio and Cavalcanti, Tatiana and Cebiro{\u g}lu Eryi{\u g}it, G{\"u}l{\c s}en and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and {\v C}{\'e}pl{\"o}, Slavom{\'{\i}}r and Cesur, Neslihan and Cetin, Savas and {\c C}etino{\u g}lu, {\"O}zlem and Chalub, Fabricio and Chamila, Liyanage and Chauhan, Shweta and Chi, Ethan and Chika, Taishi and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Chung, Juyeon and Cignarella, Alessandra T. and Cinkov{\'a}, Silvie and Collomb, Aur{\'e}lie and {\c C}{\"o}ltekin, {\c C}a{\u g}r{\i} and Connor, Miriam and Corbetta, Claudia and Corbetta, Daniela and Costa, Francisco and Courtin, Marine and Crabb{\'e}, Beno{\^{\i}}t and Cristescu, Mihaela and Cvetkoski, Vladimir and Dale, Ingerid L{\o}yning and Daniel, Philemon and Davidson, Elizabeth and de Alencar, Leonel Figueiredo and Dehouck, Mathieu and de Laurentiis, Martina and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Di Nuovo, Elisa and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Doyle, Adrian and Dozat, Timothy and Droganova, Kira and Duran, Magali Sanches and Dwivedi, Puneet and Ebert, Christian and Eckhoff, Hanne and Eguchi, Masaki and Eiche, Sandra and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Toma{\v z} and Essaidi, Farah and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\'a}rd and Favero, Federica and Ferdaousi, Jannatul and Fernanda, Mar{\'{\i}}lia and Fernandez Alcalde, Hector and Fethi, Amal and Foster, Jennifer and Fransen, Theodorus and Freitas, Cl{\'a}udia and Fujita, Kazunori and Gajdo{\v s}ov{\'a}, Katar{\'{\i}}na and Galbraith, Daniel and Gamba, Federica and Garcia, Marcos and G{\"a}rdenfors, Moa and Gerardi, Fabr{\'{\i}}cio Ferraz and Gerdes, Kim and Gessler, Luke and Ginter, Filip and Godoy, Gustavo and Goenaga, Iakes and Gojenola, Koldo and G{\"o}k{\i}rmak, Memduh and Goldberg, Yoav and G{\'o}mez Guinovart, Xavier and Gonz{\'a}lez Saavedra, + Berta and Grici{\=u}t{\.e}, Bernadeta and Grioni, Matias and Grobol, + Lo{\"{\i}}c and Gr{\= +u}z{\={\i}}tis, Normunds and Guillaume, Bruno and Guiller, Kirian and Guillot-Barbance, C{\'e}line and G{\"u}ng{\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\v c}, Jan and Haji{\v c} jr., Jan and H{\"a}m{\"a}l{\"a}inen, Mika and H{\`a} M{\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Harada, Takahiro and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\'a}, Barbora and Hlav{\'a}{\v c}ov{\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huang, Yidi and Huerta Mendez, Marivel and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\d O}l{\'a}j{\'{\i}}d{\'e} and Islamaj, Artan and Ito, Kaoru and Jagodzi{\'n}ska, Sandra and Jannat, Siratun and Jel{\'{\i}}nek, Tom{\'a}{\v s} and Jha, Apoorva and Jiang, Katharine and Johannsen, Anders and J{\'o}nsd{\'o}ttir, Hildur and J{\o}rgensen, Fredrik and Juutinen, Markus and Ka{\c s}{\i}kara, H{\"u}ner and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Kara, Neslihan and Karah{\'o}ǧa, Ritv{\'a}n and K{\aa}sen, Andre and Kayadelen, Tolga and Kengatharaiyer, Sarveswaran and Kettnerov{\'a}, V{\'a}clava and Kharatyan, Lilit and Kirchner, Jesse and Klementieva, Elena and 
Klyachko, Elena and Kocharov, Petr and K{\"o}hn, Arne and K{\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and K{\"o}se, Mehmet and Koshevoy, Alexey and Kotsyba, Natalia and Kovalevskait{\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and K{\"u}bler, Sandra and Kuqi, Adrian and Kuyruk{\c c}u, O{\u g}uzhan and Kuzgun, Asl{\i} and Kwak, Sookyoung and Kyle, Kris and Laan, K{\"a}bi and Laippala, Veronika and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\^e} H{\`{\^o}}ng, Phương and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Levine, Lauren and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yixuan and Li, Yuan and Lim, {KyungTae} and Lima Padovani, Bruna and Lin, Yi-Ju Jessica and Lind{\'e}n, Krister and Liu, Yang Janet and Ljube{\v s}i{\'c}, Nikola and Lobzhanidze, Irina and Loginova, Olga and Lopes, Lucelene and Lusito, Stefano and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Mahamdi, Menel and Maillard, Jean and Makarchuk, Ilya and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Mar{\c s}an, B{\"u}{\c s}ra and M{\u a}r{\u a}nduc, C{\u a}t{\u a}lina and Mare{\v c}ek, David and Marheinecke, Katrin and Markantonatou, Stella and Mart{\'{\i}}nez Alonso, H{\'e}ctor and Mart{\'{\i}}n Rodr{\'{\i}}guez, Lorena and Martins, Andr{\'e} and Martins, Cl{\'a}udia and Ma{\v s}ek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and Mazzei, Alessandro and {McDonald}, Ryan and {McGuinness}, Sarah and Mendon{\c c}a, Gustavo and Merzhevich, Tatiana and Miekka, Niko and Miller, Aaron and Mischenkova, Karina and Missil{\"a}, Anna and Mititelu, C{\u a}t{\u a}lin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moln{\'a}r, Judit and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Moretti, Giovanni and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\"u}{\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\'e}, Mariam and Navarro Hor{\~n}iacek, Juan Ignacio and Nedoluzhko, + Anna and Ne{\v s}pore-B{\=e}rzkalne, Gunta and Nevaci, Manuela and Nguy{\~{\^e}}n Th{\d i}, Lương and Nguy{\~{\^e}}n Th{\d i} Minh, Huy{\`{\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nunes, Maria das Gra{\c c}as Volpe and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. 
and {\'O}lad{\'o}ttir, Hulda and Ol{\'u}{\`o}kun, Ad{\'e}day{\d o}̀ and Omura, Mai and Onwuegbuzia, Emeka and Ordan, Noam and Osenova, Petya and {\"O}stling, Robert and {\O}vrelid, Lilja and {\"O}zate{\c s}, {\c S}aziye Bet{\"u}l and {\"O}z{\c c}elik, Merve and {\"O}zg{\"u}r, Arzucan and {\"O}zt{\"u}rk Ba{\c s}aran, Balk{\i}z and Paccosi, Teresa and Palmero Aprosio, Alessio and Panova, Anastasia and Pardo, Thiago Alexandre Salgueiro and Park, Hyunji Hayley and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Pedonese, Giulia and Peljak-{\L}api{\'n}ska, Angelika and Peng, Siyao and Peng, Siyao Logan and Pereira, Rita and Pereira, S{\'{\i}}lvia and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Peverelli, Andrea and Phelan, Jason and Pierre-Louis, Claudel and Piitulainen, Jussi and Pinter, Yuval and Pinto, Clara and Pintucci, Rodrigo and Pirinen, Tommi A and Pitler, Emily and Plamada, Magdalena and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalni{\c n}a, Lauma and Pr{\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\'o}rkowski, Adam and Pugh, Robert and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and Querido, Andreia and R{\"a}{\"a}bis, Andriela and Rademaker, Alexandre and Rahoman, Mizanur and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Ramos, Joana and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Regnault, Mathilde and Rehm, Georg and Riabi, Arij and Riabov, Ivan and Rie{\ss}ler, Michael and Rimkut{\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rizqiyah, Putri and Rocha, Luisa and R{\"o}gnvaldsson, Eir{\'{\i}}kur and Roksandic, Ivan and Romanenko, Mykhailo and Rosa, Rudolf and Roșca, Valentin and Rovati, Davide and Rozonoyer, Ben and Rudina, Olga and Rueter, Jack and R{\'u}narsson, Kristj{\'a}n and Sadde, Shoval and Safari, Pegah and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samard{\v z}i{\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and San{\i}yar, Ezgi and S{\"a}rg, Dage and Sartor, Marta and Sasaki, + Mitsuya and Saul{\={\i}}te, Baiba and Savary, Agata and Sawanakunanon, Yanin and Saxena, Shefali and Scannell, Kevin and Scarlata, Salvatore and Schang, Emmanuel and Schneider, Nathan and Schuster, Sebastian and Schwartz, Lane and Seddah, Djam{\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shahzadi, Syeda and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shishkina, Yana and Shohibussirri, Muh and Shvedova, Maria and Siewert, Janine and Sigurðsson, Einar Freyr and Silva, Jo{\~a}o and Silveira, Aline and Silveira, Natalia and Silveira, Sara and Simi, Maria and Simionescu, Radu and Simk{\'o}, Katalin and {\v S}imkov{\'a}, M{\'a}ria and S{\'{\i}}monarson, Haukur Barri and Simov, Kiril and Sitchinava, Dmitri and Sither, Ted and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Solberg, Per Erik and Sonnenhauser, Barbara and Sourov, Shafi and Sprugnoli, Rachele and Stamou, Vivian and Steingr{\'{\i}}msson, Stein{\t h}{\'o}r and Stella, Antonio and Stephen, Abishek and Straka, Milan and Strickland, Emmett and Strnadov{\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Swanson, Daniel and Sz{\'a}nt{\'o}, Zsolt and Taguchi, Chihiro and Taji, Dima and Tamburini, Fabio and Tan, Mary Ann C. 
and Tanaka, Takaaki and Tanaya, Dipta and Tavoni, Mirko and Tella, Samson and Tellier, Isabelle and Testori, Marinella and Thomas, Guillaume and Tonelli, Sara and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\"u}rk, Utku and Tyers, Francis and {\t H}{\'o}rðarson, Sveinbj{\"o}rn and {\t H}orsteinsson, Vilhj{\'a}lmur and Uematsu, Sumire and Untilov, Roman and Ure{\v s}ov{\'a}, Zde{\v n}ka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vagnoni, Elena and Vajjala, Sowmya and Vak, Socrates and van der Goot, Rob and Vanhove, Martine and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Vedenina, Uliana and Venturi, Giulia and Villemonte de la Clergerie, Eric and Vincze, Veronika and Vlasova, Natalia and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Wigderson, Shira and Wijono, Sri Hartati and Wille, Vanessa Berwanger and Williams, Seyi and Wir{\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\'o}blewska, Alina and Wu, Qishen and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. and Yenice, Arife Bet{\"u}l and Y{\i}ld{\i}z, Olcay Taner and Yu, Zhuoran and Yuliawati, Arlisa and {\v Z}abokrtsk{\'y}, Zden{\v e}k and Zahra, Shorouq and Zeldes, Amir and Zhou, He and Zhu, Hanzhi and Zhu, Yilun and Zhuravleva, Anna and Ziane, Rayan}, + url = {http://hdl.handle.net/11234/1-5287}, + note = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University}, + copyright = {Licence Universal Dependencies v2.13}, + year = {2023} } +""" _LANGUAGES = ["ind", "vie"] _LOCAL = False From 7df5993204c872509ba77c4736fbfb108c71df52 Mon Sep 17 00:00:00 2001 From: Ishan Jindal Date: Sun, 10 Mar 2024 10:24:18 +0530 Subject: [PATCH 09/18] Update seacrowd/sea_datasets/ud/ud.py Co-authored-by: Salsabil Maulana Akbar --- seacrowd/sea_datasets/ud/ud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index ae1b3162f..5dec691fa 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -89,7 +89,7 @@ _SUPPORTED_TASKS = [Tasks.POS_TAGGING] -_SOURCE_VERSION = "1.0.0" +_SOURCE_VERSION = "2.13" _SEACROWD_VERSION = "1.0.0" From 21b0b48d3ec8774fe36073f9bd7bc69d871667ef Mon Sep 17 00:00:00 2001 From: ijindal Date: Sat, 4 May 2024 07:08:38 +0530 Subject: [PATCH 10/18] add tl_ugnayan and tl_trg reader --- seacrowd/sea_datasets/ud/ud.py | 96 ++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index 5dec691fa..0d37b6a45 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -85,8 +85,16 @@ "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-test.conllu", "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-dev.conllu", }, + "tl_trg": { + "test": "https://github.com/UniversalDependencies/UD_Tagalog-TRG/blob/master/tl_trg-ud-test.conllu", + }, + "tl_ugnayan": { + "test": "https://github.com/UniversalDependencies/UD_Tagalog-Ugnayan/blob/master/tl_ugnayan-ud-test.conllu", + }, } + + _SUPPORTED_TASKS = [Tasks.POS_TAGGING] _SOURCE_VERSION = "2.13" @@ -403,6 
+411,94 @@ def _split_generators( ] + +class UdTlTRGDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + _SUBSET = "tl_trg" + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[self._SUBSET] + data_path = dl_manager.download(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ) + ] + + + +class UdTlUGNAYANDataset(UDDataset): + + # def __init__(self, subset): + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + _SUBSET = "tl_ugnayan" + BUILDER_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_source", + version=SOURCE_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} source schema", + schema="source", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + SEACrowdConfig( + name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", + version=SEACROWD_VERSION, + description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", + schema="seacrowd_seq_label", + subset_id=f"{_DATASETNAME}_{_SUBSET}", + ), + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" + + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[self._SUBSET] + data_path = dl_manager.download(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_path["test"], + }, + ) + ] + + if __name__ == "__main__": data = datasets.load_dataset(__file__) From 410563c8e1ca658b4d70b6b8a50be85b00e26bda Mon Sep 17 00:00:00 2001 From: ijindal Date: Sat, 4 May 2024 07:16:53 +0530 Subject: [PATCH 11/18] add subset names --- seacrowd/sea_datasets/ud/ud.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index 0d37b6a45..213c93b98 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -49,7 +49,10 @@ _SUBSETS = {"id_gsd" : "UD_Indonesian-GSD", "id_csui": "UD_Indonesian-CSUI", "id_pud" : "UD_Indonesian-PUD", - "vi_vtb": "UD_Vietnamese-VTB"} + "vi_vtb": "UD_Vietnamese-VTB", + "tl_trg": "UD_Tagalog-TRG", + "tl_ugnayan": "UD_Tagalog-Ugnayan" + } _DESCRIPTION = """\ Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation From 899f0894a88641aa96642b50a510250b13e2917f Mon Sep 17 00:00:00 2001 From: ijindal Date: Wed, 22 May 2024 23:03:32 +0530 Subject: [PATCH 12/18] updated as per https://github.com/SEACrowd/seacrowd-datahub/pull/247#issuecomment-2110640700 --- seacrowd/sea_datasets/ud/ud.py | 303 +++------------------------------ 1 file changed, 23 insertions(+), 280 deletions(-) diff --git a/seacrowd/sea_datasets/ud/ud.py 
b/seacrowd/sea_datasets/ud/ud.py index 213c93b98..b4582b310 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -113,22 +113,23 @@ class UDDataset(datasets.GeneratorBasedBuilder): SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - BUILDER_CONFIGS = [ + SOURCE_BUILDER_CONFIGS = [ SEACrowdConfig( name=f"{_DATASETNAME}_source", version=SOURCE_VERSION, description=f"{_DATASETNAME} source schema", schema="source", - subset_id=f"{_DATASETNAME}", - ), + subset_id=f"{subset_name}", + ) for subset_name in _SUBSETS.keys()] + SEQUENCE_BUILDER_CONFIGS = [ SEACrowdConfig( name=f"{_DATASETNAME}_seacrowd_seq_label", version=SEACROWD_VERSION, description=f"{_DATASETNAME} SEACrowd Seq Label schema", schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}", - ), - ] + subset_id=f"{subset_name}", + ) for subset_name in _SUBSETS.keys()] + BUILDER_CONFIGS = SOURCE_BUILDER_CONFIGS + SEQUENCE_BUILDER_CONFIGS DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" @@ -175,10 +176,8 @@ def _split_generators( self, dl_manager: datasets.DownloadManager ) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - urls = _URLS[_DATASETNAME] - # data_path = dl_manager.download(urls) - return [] + return self._ud_split_generator(dl_manager) def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: @@ -213,293 +212,37 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: yield key, example -class UdIdCSUIDataset(UDDataset): - # def __init__(self, subset): + def _ud_split_generator(self, dl_manager): - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - _SUBSET = "id_csui" - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[self._SUBSET] + split_dset = [] + if self.config.subset_id not in _SUBSETS: + return split_dset + urls = _URLS[self.config.subset_id] data_path = dl_manager.download(urls) - - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_path["train"] - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ) - ] - - -class UdIdGSDDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - _SUBSET = "id_gsd" - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - 
DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[self._SUBSET] - data_path = dl_manager.download(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_path["train"] - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ), - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": data_path["dev"], - }, - ), - ] - -class UdViVTBDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - _SUBSET = "vi_vtb" - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[self._SUBSET] - data_path = dl_manager.download(urls) - - return [ - datasets.SplitGenerator( + if "train" in data_path: + split_dset.append(datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ "filepath": data_path["train"] }, - ), - datasets.SplitGenerator( + )) + if "test" in data_path: + split_dset.append(datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "filepath": data_path["test"], }, - ), - datasets.SplitGenerator( + )) + if "dev" in data_path: + split_dset.append(datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ "filepath": data_path["dev"], }, - ), - ] + )) -class UdIdPUDDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - _SUBSET = "id_pud" - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[self._SUBSET] - data_path = dl_manager.download(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ) - ] - - - -class UdTlTRGDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - _SUBSET = "tl_trg" - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - 
version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[self._SUBSET] - data_path = dl_manager.download(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ) - ] - - - -class UdTlUGNAYANDataset(UDDataset): - - # def __init__(self, subset): - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - _SUBSET = "tl_ugnayan" - BUILDER_CONFIGS = [ - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_source", - version=SOURCE_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} source schema", - schema="source", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - SEACrowdConfig( - name=f"{_DATASETNAME}_{_SUBSET}_seacrowd_seq_label", - version=SEACROWD_VERSION, - description=f"{_DATASETNAME}_{_SUBSET} SEACrowd Seq Label schema", - schema="seacrowd_seq_label", - subset_id=f"{_DATASETNAME}_{_SUBSET}", - ), - ] - - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_{_SUBSET}_source" - - def _split_generators( - self, dl_manager: datasets.DownloadManager - ) -> List[datasets.SplitGenerator]: - """Returns SplitGenerators.""" - urls = _URLS[self._SUBSET] - data_path = dl_manager.download(urls) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": data_path["test"], - }, - ) - ] + return split_dset if __name__ == "__main__": From 8aeab0130c2998daee64fac2c9617b4278179bea Mon Sep 17 00:00:00 2001 From: Ishan Jindal Date: Tue, 28 May 2024 19:32:46 +0530 Subject: [PATCH 13/18] Update seacrowd/sea_datasets/ud/ud.py Co-authored-by: Salsabil Maulana Akbar --- seacrowd/sea_datasets/ud/ud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index b4582b310..b6578b476 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -41,7 +41,7 @@ year = {2023} } """ -_LANGUAGES = ["ind", "vie"] +_LANGUAGES = ["ind", "vie", "tgl"] _LOCAL = False _DATASETNAME = "ud-v2.13" From a4ede1a57a93172a9dec27aaef06d360dbd17c16 Mon Sep 17 00:00:00 2001 From: Ishan Jindal Date: Tue, 28 May 2024 19:33:20 +0530 Subject: [PATCH 14/18] Update seacrowd/sea_datasets/ud/ud.py Co-authored-by: Salsabil Maulana Akbar --- seacrowd/sea_datasets/ud/ud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index b6578b476..0c134abb6 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -44,7 +44,7 @@ _LANGUAGES = ["ind", "vie", "tgl"] _LOCAL = False -_DATASETNAME = "ud-v2.13" +_DATASETNAME = "ud" _SUBSETS = {"id_gsd" : "UD_Indonesian-GSD", "id_csui": "UD_Indonesian-CSUI", From 12b1205018fc8096fbb90e07c7e7e2e1fc0ed98b Mon Sep 17 00:00:00 2001 From: Ishan Jindal Date: Tue, 28 May 2024 19:33:39 +0530 Subject: [PATCH 15/18] Update seacrowd/sea_datasets/ud/ud.py Co-authored-by: Salsabil Maulana Akbar --- seacrowd/sea_datasets/ud/ud.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index 0c134abb6..e7afaa555 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -66,7 +66,7 @@ _HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" -_LICENSE = ["Apache license 2.0 (apache-2.0)"] +_LICENSE = Licenses.APACHE_2_0.value _URLS = { "ud-v2.12": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/ud-treebanks-v2.12.tgz?sequence=1&isAllowed=y", From 7bef0ce2d055e19da9ba2b971d6d3ba1bd6bbfd5 Mon Sep 17 00:00:00 2001 From: ijindal Date: Tue, 28 May 2024 19:51:21 +0530 Subject: [PATCH 16/18] fix few of the comments --- seacrowd/sea_datasets/ud/ud.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index e7afaa555..67b3aeeff 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -23,7 +23,7 @@ from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb from seacrowd.utils.configs import SEACrowdConfig from seacrowd.utils.constants import Tasks - +from seacrowd.utils.constants import Licenses _CITATION = """ @misc{11234/1-5287, @@ -68,9 +68,10 @@ _LICENSE = Licenses.APACHE_2_0.value +# "ud-v2.12": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/ud-treebanks-v2.12.tgz?sequence=1&isAllowed=y" +# "ud-v2.13": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5287/ud-treebanks-v2.13.tgz?sequence=1&isAllowed=y" + _URLS = { - "ud-v2.12": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/ud-treebanks-v2.12.tgz?sequence=1&isAllowed=y", - "ud-v2.13": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5287/ud-treebanks-v2.13.tgz?sequence=1&isAllowed=y", "id_csui": { "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-train.conllu", "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-test.conllu", @@ -115,7 +116,7 @@ class UDDataset(datasets.GeneratorBasedBuilder): SOURCE_BUILDER_CONFIGS = [ SEACrowdConfig( - name=f"{_DATASETNAME}_source", + name=f"{_DATASETNAME}_{subset_name}_source", version=SOURCE_VERSION, description=f"{_DATASETNAME} source schema", schema="source", @@ -123,7 +124,7 @@ class UDDataset(datasets.GeneratorBasedBuilder): ) for subset_name in _SUBSETS.keys()] SEQUENCE_BUILDER_CONFIGS = [ SEACrowdConfig( - name=f"{_DATASETNAME}_seacrowd_seq_label", + name=f"{_DATASETNAME}_{subset_name}_seacrowd_seq_label", version=SEACROWD_VERSION, description=f"{_DATASETNAME} SEACrowd Seq Label schema", schema="seacrowd_seq_label", From d03db9c8f969b693a6fafb0dce9195e70c61bcfc Mon Sep 17 00:00:00 2001 From: ijindal Date: Tue, 28 May 2024 20:35:22 +0530 Subject: [PATCH 17/18] fix few of the comments --- seacrowd/sea_datasets/ud/ud.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index 67b3aeeff..7f1298be8 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -25,7 +25,7 @@ from seacrowd.utils.constants import Tasks from seacrowd.utils.constants import Licenses -_CITATION = """ +_CITATION = r""" @misc{11234/1-5287, title = {Universal Dependencies 2.13}, author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, 
No{\"e}mi and Aghaei, Hamid and Agi{\'c}, {\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Akkurt, @@ -90,10 +90,10 @@ "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-dev.conllu", }, "tl_trg": { - "test": "https://github.com/UniversalDependencies/UD_Tagalog-TRG/blob/master/tl_trg-ud-test.conllu", + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-TRG/master/tl_trg-ud-test.conllu", }, "tl_ugnayan": { - "test": "https://github.com/UniversalDependencies/UD_Tagalog-Ugnayan/blob/master/tl_ugnayan-ud-test.conllu", + "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-Ugnayan/master/tl_ugnayan-ud-test.conllu", }, } @@ -101,7 +101,7 @@ _SUPPORTED_TASKS = [Tasks.POS_TAGGING] -_SOURCE_VERSION = "2.13" +_SOURCE_VERSION = "2.13.0" _SEACROWD_VERSION = "1.0.0" @@ -132,8 +132,6 @@ class UDDataset(datasets.GeneratorBasedBuilder): ) for subset_name in _SUBSETS.keys()] BUILDER_CONFIGS = SOURCE_BUILDER_CONFIGS + SEQUENCE_BUILDER_CONFIGS - DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source" - UPOS_TAGS = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"] def _info(self) -> datasets.DatasetInfo: @@ -246,6 +244,4 @@ def _ud_split_generator(self, dl_manager): return split_dset -if __name__ == "__main__": - data = datasets.load_dataset(__file__) From ebc5d669bbe003b291175349f207a85c9f8a3d03 Mon Sep 17 00:00:00 2001 From: Salsabil Maulana Akbar Date: Fri, 31 May 2024 15:25:13 +0700 Subject: [PATCH 18/18] Update ud.py extend `ud` dataloader to multiple tasks, adjust data loading methods based on existing dataloaders, and add custom citations per subsets --- seacrowd/sea_datasets/ud/ud.py | 397 +++++++++++++++++++++++++++------ 1 file changed, 323 insertions(+), 74 deletions(-) diff --git a/seacrowd/sea_datasets/ud/ud.py b/seacrowd/sea_datasets/ud/ud.py index 7f1298be8..26bfd21fc 100755 --- a/seacrowd/sea_datasets/ud/ud.py +++ b/seacrowd/sea_datasets/ud/ud.py @@ -14,16 +14,16 @@ # limitations under the License. from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Iterable import datasets +from copy import deepcopy from conllu import TokenList from seacrowd.utils import schemas from seacrowd.utils.common_parser import load_ud_data, load_ud_data_as_seacrowd_kb from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks -from seacrowd.utils.constants import Licenses +from seacrowd.utils.constants import Tasks, Licenses _CITATION = r""" @misc{11234/1-5287, @@ -46,13 +46,17 @@ _DATASETNAME = "ud" -_SUBSETS = {"id_gsd" : "UD_Indonesian-GSD", - "id_csui": "UD_Indonesian-CSUI", - "id_pud" : "UD_Indonesian-PUD", - "vi_vtb": "UD_Vietnamese-VTB", - "tl_trg": "UD_Tagalog-TRG", - "tl_ugnayan": "UD_Tagalog-Ugnayan" - } +_SUPPORTED_TASKS = [Tasks.POS_TAGGING, Tasks.DEPENDENCY_PARSING, Tasks.MACHINE_TRANSLATION] + +#map source subset names to index in `_SUPPORTED_TASKS` +_SOURCE_SUBSETS_TO_TASKS_INDEX = { + "id_csui": [0,1,2], + "id_gsd": [0,1], + "id_pud": [0,2], + "vi_vtb": [0,1], + "tl_trg": [0,1,2], + "tl_ugnayan": [0,2] +} _DESCRIPTION = """\ Universal Dependencies (UD) is a project that is developing cross-linguistically consistent treebank annotation @@ -64,6 +68,12 @@ annotation of similar constructions across languages, while allowing language-specific extensions when necessary. 
""" +_ISO_LANG_MAPPER_UD = { + "id": "ind", + "vi": "vie", + "tl": "tgl" +} + _HOMEPAGE = "https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-5287" _LICENSE = Licenses.APACHE_2_0.value @@ -72,94 +82,118 @@ # "ud-v2.13": "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5287/ud-treebanks-v2.13.tgz?sequence=1&isAllowed=y" _URLS = { - "id_csui": { + "ud_id_csui": { "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-train.conllu", "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-test.conllu", }, - "id_gsd": { - "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-train.conllu", - "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-test.conllu", - "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu", + "ud_id_gsd": { + "train": "https://raw.githubusercontent.com/indolem/indolem/main/dependency_parsing/UD_Indonesian_GSD/id_gsd-ud-train.conllu", + "test": "https://raw.githubusercontent.com/indolem/indolem/main/dependency_parsing/UD_Indonesian_GSD/id_gsd-ud-test.conllu", + "dev": "https://raw.githubusercontent.com/indolem/indolem/main/dependency_parsing/UD_Indonesian_GSD/id_gsd-ud-dev.conllu", }, - "id_pud": { + "ud_id_pud": { "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-PUD/master/id_pud-ud-test.conllu" }, - "vi_vtb": { + "ud_vi_vtb": { "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-train.conllu", "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-test.conllu", "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Vietnamese-VTB/master/vi_vtb-ud-dev.conllu", }, - "tl_trg": { + "ud_tl_trg": { "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-TRG/master/tl_trg-ud-test.conllu", }, - "tl_ugnayan": { + "ud_tl_ugnayan": { "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Tagalog-Ugnayan/master/tl_ugnayan-ud-test.conllu", }, } - - -_SUPPORTED_TASKS = [Tasks.POS_TAGGING] - _SOURCE_VERSION = "2.13.0" _SEACROWD_VERSION = "1.0.0" - - - class UDDataset(datasets.GeneratorBasedBuilder): - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - SOURCE_BUILDER_CONFIGS = [ SEACrowdConfig( name=f"{_DATASETNAME}_{subset_name}_source", - version=SOURCE_VERSION, + version=datasets.Version(_SOURCE_VERSION), description=f"{_DATASETNAME} source schema", schema="source", - subset_id=f"{subset_name}", - ) for subset_name in _SUBSETS.keys()] + subset_id=f"{_DATASETNAME}_{subset_name}", + ) + for subset_name in _SOURCE_SUBSETS_TO_TASKS_INDEX.keys()] SEQUENCE_BUILDER_CONFIGS = [ SEACrowdConfig( name=f"{_DATASETNAME}_{subset_name}_seacrowd_seq_label", - version=SEACROWD_VERSION, + version=datasets.Version(_SEACROWD_VERSION), description=f"{_DATASETNAME} SEACrowd Seq Label schema", schema="seacrowd_seq_label", - subset_id=f"{subset_name}", - ) for subset_name in _SUBSETS.keys()] - BUILDER_CONFIGS = SOURCE_BUILDER_CONFIGS + SEQUENCE_BUILDER_CONFIGS + subset_id=f"{_DATASETNAME}_{subset_name}", + ) + for subset_name, task_idx in _SOURCE_SUBSETS_TO_TASKS_INDEX.items() if _SUPPORTED_TASKS.index(Tasks.POS_TAGGING) in task_idx] + KB_CONFIGS = [ + SEACrowdConfig( + 
name=f"{_DATASETNAME}_{subset_name}_seacrowd_kb", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd Knowlegde Base schema", + schema="seacrowd_kb", + subset_id=f"{_DATASETNAME}_{subset_name}", + ) + for subset_name, task_idx in _SOURCE_SUBSETS_TO_TASKS_INDEX.items() if _SUPPORTED_TASKS.index(Tasks.DEPENDENCY_PARSING) in task_idx] + T2T_CONFIGS = [ + SEACrowdConfig( + name=f"{_DATASETNAME}_{subset_name}_seacrowd_t2t", + version=datasets.Version(_SEACROWD_VERSION), + description=f"{_DATASETNAME} SEACrowd Translation T2T schema EN-XX", + schema="seacrowd_t2t", + subset_id=f"{_DATASETNAME}_{subset_name}", + ) + for subset_name, task_idx in _SOURCE_SUBSETS_TO_TASKS_INDEX.items() if _SUPPORTED_TASKS.index(Tasks.MACHINE_TRANSLATION) in task_idx] + + BUILDER_CONFIGS = SOURCE_BUILDER_CONFIGS + SEQUENCE_BUILDER_CONFIGS + KB_CONFIGS + T2T_CONFIGS UPOS_TAGS = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"] def _info(self) -> datasets.DatasetInfo: - self.config.schema = "seacrowd_seq_label" if self.config.schema == "source": - features = datasets.Features( - { - # metadata - "sent_id": datasets.Sequence(datasets.Value("string")), - "text": datasets.Sequence(datasets.Value("string")), - "text_en": datasets.Sequence(datasets.Value("string")), - # tokens - "id": [datasets.Sequence(datasets.Value("string"))], - "form": [datasets.Sequence(datasets.Value("string"))], - "lemma": [datasets.Sequence(datasets.Value("string"))], - "upos": [datasets.Sequence(datasets.Value("string"))], - "xpos": [datasets.Sequence(datasets.Value("string"))], - "feats": [datasets.Sequence(datasets.Value("string"))], - "head": [datasets.Sequence(datasets.Value("string"))], - "deprel": [datasets.Sequence(datasets.Value("string"))], - "deps": [datasets.Sequence(datasets.Value("string"))], - "misc": [datasets.Sequence(datasets.Value("string"))], - } - ) + schema_dict = { + # metadata + "sent_id": datasets.Value("string"), + "text": datasets.Value("string"), + # tokens + "id": datasets.Sequence(datasets.Value("string")), + "form": datasets.Sequence(datasets.Value("string")), + "lemma": datasets.Sequence(datasets.Value("string")), + "upos": datasets.Sequence(datasets.Value("string")), + "xpos": datasets.Sequence(datasets.Value("string")), + "feats": datasets.Sequence(datasets.Value("string")), + "head": datasets.Sequence(datasets.Value("string")), + "deprel": datasets.Sequence(datasets.Value("string")), + "deps": datasets.Sequence(datasets.Value("string")), + "misc": datasets.Sequence(datasets.Value("string")), + } + + # add text_en for UD data that has en text (for T2T) + if _SUPPORTED_TASKS.index(Tasks.MACHINE_TRANSLATION) in _SOURCE_SUBSETS_TO_TASKS_INDEX["_".join(self.config.subset_id.split("_")[1:])]: + schema_dict["text_en"] = datasets.Value("string") + + # add "gloss" and "source" for tl_trg subset + if self.config.subset_id == "ud_tl_trg": + schema_dict["gloss"] = datasets.Value("string") + schema_dict["source"] = datasets.Value("string") + + features = datasets.Features(schema_dict) elif self.config.schema == "seacrowd_seq_label": features = schemas.seq_label_features(self.UPOS_TAGS) + elif self.config.schema == "seacrowd_kb": + features = schemas.kb_features + + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + else: raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") @@ -168,7 +202,7 @@ def _info(self) -> datasets.DatasetInfo: features=features, 
homepage=_HOMEPAGE, license=_LICENSE, - citation=_CITATION, + citation=self._generate_additional_citation(self.config.subset_id), ) def _split_generators( @@ -176,48 +210,221 @@ def _split_generators( ) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - return self._ud_split_generator(dl_manager) + return self._ud_split_generator(dl_manager, self.config.subset_id) def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: """instance tuple generated in the form (key, labels)""" - - dataset = list( - load_ud_data( - filepath, - filter_kwargs={"id": lambda i: isinstance(i, int)}, - ) - ) + dataset = self._ud_generate_examples(filepath, self.config.subset_id, self.info.features, self.config.schema == "source") if self.config.schema == "source": pass elif self.config.schema == "seacrowd_seq_label": + #some data has label of "_" which indicates a token that has multiple labels (it has the splitted values in the subsequent/preceeding iterables) + def remove_invalid_labels_from_seq(sent_id: str, tokens: Iterable, labels: Iterable, invalid_tokens: Iterable): + _tokens, _labels = [], [] + for idx, val in enumerate(labels): + if val not in invalid_tokens: + _tokens.append(tokens[idx]) + _labels.append(labels[idx]) + + return sent_id, _tokens, _labels + + dataset = list( + map( + lambda d: dict(zip( + ("id", "tokens", "labels"), + remove_invalid_labels_from_seq(d["sent_id"], d["form"], d["upos"], + invalid_tokens=("_")) + )), + filter(lambda d: len(d["form"]) == len(d["upos"]),dataset) + ) + ) + + elif self.config.schema == "seacrowd_t2t": dataset = list( map( lambda d: { "id": d["sent_id"], - "tokens": d["form"], - "labels": d["upos"], + "text_1": d["text_en"], + "text_2": d["text"], + "text_1_name": "eng", + "text_2_name": _ISO_LANG_MAPPER_UD[self.config.subset_id.split("_")[1]], }, - dataset, + filter(lambda d: d.get("text_en"), dataset), ) ) + elif self.config.schema == "seacrowd_kb": + morph_anomaly = self._get_morph_exceptions(self.config.subset_id) + dataset = load_ud_data_as_seacrowd_kb( + filepath, + dataset, + morph_exceptions=morph_anomaly + ) + else: raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.") for key, example in enumerate(dataset): yield key, example + @staticmethod + def _set_load_ud_source_data_kwargs(subset_name: str): + + def _assert_multispan_range_is_one(token_list: TokenList): + """ + Asserting that all tokens with multiple span can only have 2 span, and \ + no field other than form has important information + """ + for token in token_list.filter(id=lambda i: not isinstance(i, int)): + _id = token["id"] + assert len(_id) == 3, f"Unexpected length of non-int CONLLU Token's id. Expected 3, found {len(_id)};" + assert all(isinstance(a, b) for a, b in zip(_id, [int, str, int])), f"Non-int ID should be in format of '\\d+-\\d+'. Found {_id};" + assert _id[2] - _id[0] == 1, f"Token has more than 2 spans. Found {_id[2] - _id[0] + 1} spans;" + for key in ["lemma", "upos", "xpos", "feats", "head", "deprel", "deps"]: + assert token[key] in {"_", None}, f"Field other than 'form' should not contain extra information. 
Found: '{key}' = '{token[key]}'" + + kwargs_return = {} + + if subset_name == "ud_id_csui": + kwargs_return = { + "filter_kwargs": {"id": lambda i: isinstance(i, int)}, + "assert_fn": _assert_multispan_range_is_one} + + if subset_name == "ud_jv_csui": + kwargs_return = { + "filter_kwargs": {"id": lambda i: isinstance(i, int)}} + + return kwargs_return + + @staticmethod + def _generate_additional_citation(subset_name: str): + # generate additional citation, which `_CITATION` value defined above is appended to the subset-based UD citation + + if subset_name == "ud_id_csui": + CITATION = r""" + @article {10.3844/jcssp.2020.1585.1597, + author = {Alfina, Ika and Budi, Indra and Suhartanto, Heru}, + title = {Tree Rotations for Dependency Trees: Converting the Head-Directionality of Noun Phrases}, + article_type = {journal}, + volume = {16}, + number = {11}, + year = {2020}, + month = {Nov}, + pages = {1585-1597}, + doi = {10.3844/jcssp.2020.1585.1597}, + url = {https://thescipub.com/abstract/jcssp.2020.1585.1597}, + journal = {Journal of Computer Science}, + publisher = {Science Publications} + } + + """ + _CITATION + + if subset_name == "ud_id_gsd": + CITATION = r""" + @inproceedings{mcdonald-etal-2013-universal, + title = "{U}niversal {D}ependency Annotation for Multilingual Parsing", + author = {McDonald, Ryan and + Nivre, Joakim and + Quirmbach-Brundage, Yvonne and + Goldberg, Yoav and + Das, Dipanjan and + Ganchev, Kuzman and + Hall, Keith and + Petrov, Slav and + Zhang, Hao and + T{\"a}ckstr{\"o}m, Oscar and + Bedini, Claudia and + Bertomeu Castell{\'o}, N{\'u}ria and + Lee, Jungmee}, + booktitle = "Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)", + month = aug, + year = "2013", + address = "Sofia, Bulgaria", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/P13-2017", + pages = "92--97", + } + + @article{DBLP:journals/corr/abs-2011-00677, + author = {Fajri Koto and + Afshin Rahimi and + Jey Han Lau and + Timothy Baldwin}, + title = {IndoLEM and IndoBERT: {A} Benchmark Dataset and Pre-trained Language + Model for Indonesian {NLP}}, + journal = {CoRR}, + volume = {abs/2011.00677}, + year = {2020}, + url = {https://arxiv.org/abs/2011.00677}, + eprinttype = {arXiv}, + eprint = {2011.00677}, + timestamp = {Fri, 06 Nov 2020 15:32:47 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2011-00677.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} + + """ + _CITATION + + if subset_name == "ud_id_gsd": + CITATION = r""" + @conference{2f8c7438a7f44f6b85b773586cff54e8, + title = "A gold standard dependency treebank for Indonesian", + author = "Ika Alfina and Arawinda Dinakaramani and Fanany, {Mohamad Ivan} and Heru Suhartanto", + note = "Publisher Copyright: {\textcopyright} 2019 Proceedings of the 33rd Pacific Asia Conference on Language, Information and Computation, PACLIC 2019. 
All rights reserved.; \ + 33rd Pacific Asia Conference on Language, Information and Computation, PACLIC 2019 ; Conference date: 13-09-2019 Through 15-09-2019", + year = "2019", + month = jan, + day = "1", + language = "English", + pages = "1--9", + } + + @article{DBLP:journals/corr/abs-2011-00677, + author = {Fajri Koto and + Afshin Rahimi and + Jey Han Lau and + Timothy Baldwin}, + title = {IndoLEM and IndoBERT: {A} Benchmark Dataset and Pre-trained Language + Model for Indonesian {NLP}}, + journal = {CoRR}, + volume = {abs/2011.00677}, + year = {2020}, + url = {https://arxiv.org/abs/2011.00677}, + eprinttype = {arXiv}, + eprint = {2011.00677}, + timestamp = {Fri, 06 Nov 2020 15:32:47 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2011-00677.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} + } + + """ + _CITATION + + else: + CITATION = _CITATION + + return CITATION + + @staticmethod + def _get_morph_exceptions(subset_name: str): + morph_anomaly = [] + # not implemented yet + # if subset_name == "ud_jv_csui": + # morph_anomaly = [ + # # Exceptions due to inconsistencies in the raw data annotation + # ("ne", "e"), + # ("nipun", "ipun"), + # ("me", "e"), # occurrence word: "Esemme" = "Esem" + "e". original text has double 'm'. + # ] + return morph_anomaly - def _ud_split_generator(self, dl_manager): + @staticmethod + def _ud_split_generator(dl_manager, subset_name: str): split_dset = [] - if self.config.subset_id not in _SUBSETS: - return split_dset - urls = _URLS[self.config.subset_id] + urls = _URLS[subset_name] data_path = dl_manager.download(urls) if "train" in data_path: split_dset.append(datasets.SplitGenerator( @@ -242,6 +449,48 @@ def _ud_split_generator(self, dl_manager): )) return split_dset + + @classmethod + def _ud_generate_examples(cls, filepath: str | list, subset_name: str, features: Iterable, is_source: bool): + + #utility to fill data w/ default val + def fill_data(data, col_name, fill_val): + _data = deepcopy(data) + _data[col_name] = _data.get(col_name, fill_val) + return _data + + #utility to remove data + def pop_data(data, col_name): + _data = deepcopy(data) + _data.pop(col_name, None) + return _data + + # allow list of filepath be loaded + if isinstance(filepath, str): + filepath = [filepath] + + dataset = [] + for _filepath in filepath: + dataset.extend(list( + load_ud_data( + _filepath, **cls._set_load_ud_source_data_kwargs(subset_name) + ) + )) - - + # remove the data from source since the occurrence is small (presumably malformed) + # and not listed in misc features (https://tables.grew.fr/?data=ud_feats/MISC) + if subset_name == "ud_tl_ugnayan": + for key in ("newdoc_id", "text_id"): + dataset = list(map(lambda x: pop_data(x, key), dataset)) + + if subset_name == "ud_tl_trg": + for key in ("AP", "BP", "OP", "DP", "PIV"): + dataset = list(map(lambda x: pop_data(x, key), dataset)) + + # fill w/ default only for Source schema + if is_source: + for key, default_val in zip(("text_en", "gloss", "source"), ("", "", "")): + if key in features: + dataset = list(map(lambda x: fill_data(x, key, default_val), dataset)) + + return dataset
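
With the final patch applied, every subset and schema pair is addressed through a single config name of the form f"{_DATASETNAME}_{subset_name}_{schema}", e.g. ud_id_gsd_seacrowd_seq_label. A minimal loading sketch follows, assuming a local checkout of the repository and a datasets release that still supports script-based loaders; the script path below is an assumption, mirroring the datasets.load_dataset(__file__) smoke test that earlier revisions of ud.py carried:

    import datasets

    # "ud_id_gsd" provides train/test/dev splits per _URLS; the seq_label
    # schema exposes "tokens" and "labels" (UPOS tags) per sentence.
    dset = datasets.load_dataset(
        "seacrowd/sea_datasets/ud/ud.py",  # assumed path inside a checkout
        name="ud_id_gsd_seacrowd_seq_label",
    )
    print(dset["train"][0]["tokens"][:5])
    print(dset["train"][0]["labels"][:5])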
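
The BUILDER_CONFIGS comprehensions only emit a seacrowd schema for a subset when _SOURCE_SUBSETS_TO_TASKS_INDEX lists the matching task, so a treebank without English translations, such as ud_vi_vtb, never receives a t2t config. A standalone sketch of the same pattern, with the subset map abbreviated and the schema suffixes taken from the diff:

    TASK_SUFFIXES = ["seq_label", "kb", "t2t"]  # POS tagging, dep. parsing, MT
    SUBSET_TASKS = {"id_csui": [0, 1, 2], "id_pud": [0, 2], "vi_vtb": [0, 1]}

    def config_names(dataset_name: str = "ud") -> list:
        # One source config per subset, plus one seacrowd config per task
        # the subset actually supports.
        names = [f"{dataset_name}_{s}_source" for s in SUBSET_TASKS]
        for subset, task_ids in SUBSET_TASKS.items():
            names += [f"{dataset_name}_{subset}_seacrowd_{TASK_SUFFIXES[i]}"
                      for i in task_ids]
        return names

    print(config_names())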
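
_ud_split_generator keys the splits off the downloaded mapping rather than off per-subset classes, which is what lets one builder serve both three-split treebanks (ud_id_gsd, ud_vi_vtb) and test-only ones (ud_id_pud, ud_tl_trg, ud_tl_ugnayan). The same presence-based pattern in isolation:

    from typing import Dict, List

    import datasets

    def split_generators_for(data_path: Dict[str, str]) -> List[datasets.SplitGenerator]:
        # Emit a SplitGenerator only for the UD files that were actually
        # downloaded; "dev" conventionally maps onto the VALIDATION split.
        key_to_split = {
            "train": datasets.Split.TRAIN,
            "test": datasets.Split.TEST,
            "dev": datasets.Split.VALIDATION,
        }
        return [
            datasets.SplitGenerator(name=split, gen_kwargs={"filepath": data_path[key]})
            for key, split in key_to_split.items()
            if key in data_path
        ]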
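
In the seacrowd_seq_label branch, tokens whose UPOS is "_" are dropped together with their forms, since in CoNLL-U a multi-word token line carries "_" in its annotation columns while the real tags sit on the component rows; keeping the two lists aligned is the point of remove_invalid_labels_from_seq. A self-contained illustration (the Indonesian example is invented):

    def drop_unlabelled(tokens, labels, invalid=("_",)):
        # Keep a token/label pair only when the label is a real UPOS tag.
        kept = [(t, l) for t, l in zip(tokens, labels) if l not in invalid]
        return [t for t, _ in kept], [l for _, l in kept]

    tokens = ["dengannya", "dengan", "nya"]
    labels = ["_", "ADP", "PRON"]
    print(drop_unlabelled(tokens, labels))  # (['dengan', 'nya'], ['ADP', 'PRON'])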
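
Only sentences that carry a text_en comment are exposed through seacrowd_t2t, and the target language code is derived from the subset prefix via _ISO_LANG_MAPPER_UD. A sketch of the per-sentence mapping applied to a hand-written record:

    ISO_MAP = {"id": "ind", "vi": "vie", "tl": "tgl"}  # mirrors _ISO_LANG_MAPPER_UD

    def to_t2t(record: dict, subset_id: str):
        # Sentences without an English translation are skipped, matching the
        # filter(lambda d: d.get("text_en"), dataset) step in the loader.
        if not record.get("text_en"):
            return None
        return {
            "id": record["sent_id"],
            "text_1": record["text_en"],
            "text_2": record["text"],
            "text_1_name": "eng",
            "text_2_name": ISO_MAP[subset_id.split("_")[1]],
        }

    sample = {"sent_id": "demo-1", "text": "Apa ini?", "text_en": "What is this?"}
    print(to_t2t(sample, "ud_id_pud"))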
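
_assert_multispan_range_is_one leans on how the conllu library represents ranged ids: a line such as "1-2" is parsed into the tuple (1, "-", 2), while plain ids stay ints, which is also the property that filter_kwargs={"id": lambda i: isinstance(i, int)} keys on. A small demonstration against a fabricated two-token CoNLL-U sentence:

    from conllu import parse

    rows = [
        "# sent_id = demo-1",
        "# text = dengannya",
        "\t".join(["1-2", "dengannya", "_", "_", "_", "_", "_", "_", "_", "_"]),
        "\t".join(["1", "dengan", "dengan", "ADP", "_", "_", "0", "root", "_", "_"]),
        "\t".join(["2", "nya", "nya", "PRON", "_", "_", "1", "case", "_", "_"]),
    ]
    sentence = parse("\n".join(rows) + "\n")[0]

    # Ranged ids come back as (int, str, int) tuples, exactly what the
    # loader's zip(_id, [int, str, int]) assertion checks.
    for token in sentence.filter(id=lambda i: not isinstance(i, int)):
        start, sep, end = token["id"]            # e.g. (1, "-", 2)
        assert sep == "-" and end - start == 1   # spans wider than two are rejected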