From d158d6d5d10a02a8cc46497e342f4e101753f3d1 Mon Sep 17 00:00:00 2001 From: Yuze GAO Date: Thu, 14 Mar 2024 22:55:25 +0800 Subject: [PATCH 1/4] update csubset composition --- seacrowd/sea_datasets/tydiqa/tydiqa.py | 199 ++++++++++++++++--------- 1 file changed, 130 insertions(+), 69 deletions(-) diff --git a/seacrowd/sea_datasets/tydiqa/tydiqa.py b/seacrowd/sea_datasets/tydiqa/tydiqa.py index 627710357..28b086bcb 100644 --- a/seacrowd/sea_datasets/tydiqa/tydiqa.py +++ b/seacrowd/sea_datasets/tydiqa/tydiqa.py @@ -95,28 +95,28 @@ "dev": _URL + "v1.1/tydiqa-goldp-v1.1-dev.json", } -_PRIMARY_DESP = """Passage selection task (SelectP): Given a list of the passages in the article, return either (a) the index of - the passage that answers the question or (b) NULL if no such passage exists. - Minimal answer span task (MinSpan): Given the full text of an article, return one of (a) the start and end - byte indices of the minimal span that completely answers the question; (b) YES or NO if the question requires - a yes/no answer and we can draw a conclusion from the passage; (c) NULL if it is not possible to produce a - minimal answer for this question.""" - -_SECONDARY_DESP = """Gold passage task (GoldP): Given a passage that is guaranteed to contain the +_SELECTP_DESP = """Passage selection task (SelectP): Given a list of the passages in the article, return either (a) the index of + the passage that answers the question or (b) NULL if no such passage exists. + """ +_MINSPAN_DESP = """Minimal answer span task (MinSpan): Given the full text of an article, return one of (a) the start and end + byte indices of the minimal span that completely answers the question; (b) YES or NO if the question requires + a yes/no answer and we can draw a conclusion from the passage; (c) NULL if it is not possible to produce a + minimal answer for this question.""" +_GOLDP_DESP = """Gold passage task (GoldP): Given a passage that is guaranteed to contain the answer, predict the single contiguous span of characters that answers the question. This is more similar to existing reading comprehension datasets (as opposed to the information-seeking task outlined above). - This task is constructed with two goals in mind: (1) more directly comparing with prior work and (2) providing - a simplified way for researchers to use TyDi QA by providing compatibility with existing code for SQuAD 1.1, - XQuAD, and MLQA. Toward these goals, the gold passage task differs from the primary task in several ways: - only the gold answer passage is provided rather than the entire Wikipedia article; - unanswerable questions have been discarded, similar to MLQA and XQuAD; - we evaluate with the SQuAD 1.1 metrics like XQuAD; and - Thai and Japanese are removed since the lack of whitespace breaks some tools. + """ +_ID_DESP = """{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation, is a benchmark + for evaluating Indonesian natural language generation (NLG) systems. The question-answer pairs are collected + for each language without using translation services. It uses the Indonesian data from the secondary Gold + passage task of the TyDiQA dataset. As the original dataset only provides training and validation sets, + TydiQA-ID randomly split off 15% of the training data and use it as the test set. """ def config_constructor(subset_id, schema, desc, version): - return SEACrowdConfig(name=f"{_DATASETNAME}_{subset_id}_{schema}", description=desc, version=datasets.Version(version), schema=schema, subset_id=subset_id) + return SEACrowdConfig(name=f"{_DATASETNAME}_{subset_id}_{schema}", description=desc, + version=datasets.Version(version), schema=schema, subset_id=subset_id) class TydiqaDataset(datasets.GeneratorBasedBuilder): @@ -128,24 +128,48 @@ class TydiqaDataset(datasets.GeneratorBasedBuilder): BUILDER_CONFIGS = [ # source schema - config_constructor(subset_id="primary_task", schema="source", desc=_PRIMARY_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="primary_task_indonesian", schema="source", desc=_PRIMARY_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="primary_task_thai", schema="source", desc=_PRIMARY_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="secondary_task", schema="source", desc=_SECONDARY_DESP, version=_SOURCE_VERSION_S), - config_constructor(subset_id="secondary_task_indonesian", schema="source", desc=_SECONDARY_DESP, version=_SOURCE_VERSION_S), - config_constructor(subset_id="id", schema="source", desc=_SECONDARY_DESP, version=_SOURCE_VERSION_S), + # selectp source schema + config_constructor(subset_id="selectp", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="selectp_indonesian", schema="source", desc=_SELECTP_DESP, + version=_SOURCE_VERSION_P), + config_constructor(subset_id="selectp_thai", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), + # minspan source schema + config_constructor(subset_id="minspan", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="minspan_indonesian", schema="source", desc=_MINSPAN_DESP, + version=_SOURCE_VERSION_P), + config_constructor(subset_id="minspan_thai", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), + # goldp source schema + config_constructor(subset_id="goldp", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), + config_constructor(subset_id="goldp_indonesian", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), + config_constructor(subset_id="goldp_thai", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), + # tydiqa_id source schema + config_constructor(subset_id="id", schema="source", desc=_ID_DESP, version=_SOURCE_VERSION_P), + # seacrowd schema - config_constructor(subset_id="primary_task", schema="seacrowd_qa", desc=_PRIMARY_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="primary_task_indonesian", schema="seacrowd_qa", desc=_PRIMARY_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="primary_task_thai", schema="seacrowd_qa", desc=_PRIMARY_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="secondary_task", schema="seacrowd_qa", desc=_SECONDARY_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="secondary_task_indonesian", schema="seacrowd_qa", desc=_SECONDARY_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="id", schema="seacrowd_qa", desc=_SECONDARY_DESP, version=_SEACROWD_VERSION), + # selectp seacrowd schema + config_constructor(subset_id="selectp", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="selectp_indonesian", schema="seacrowd_qa", desc=_SELECTP_DESP, + version=_SEACROWD_VERSION), + config_constructor(subset_id="selectp_thai", schema="seacrowd_qa", desc=_SELECTP_DESP, + version=_SEACROWD_VERSION), + # minspan seacrowd schema + config_constructor(subset_id="minspan", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="minspan_indonesian", schema="seacrowd_qa", desc=_MINSPAN_DESP, + version=_SEACROWD_VERSION), + config_constructor(subset_id="minspan_thai", schema="seacrowd_qa", desc=_MINSPAN_DESP, + version=_SEACROWD_VERSION), + # goldp seacrowd schema + config_constructor(subset_id="goldp", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="goldp_indonesian", schema="seacrowd_qa", desc=_GOLDP_DESP, + version=_SEACROWD_VERSION), + config_constructor(subset_id="goldp_thai", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), + # tydiqa_id seacrowd schema + config_constructor(subset_id="id", schema="seacrowd_qa", desc=_ID_DESP, version=_SEACROWD_VERSION), ] DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_id_source" def _info(self): - if "primary_task" in self.config.name: + if ("selectp" in self.config.name) or ("minspan" in self.config.name): if "source" in self.config.name: features = datasets.Features( { @@ -190,7 +214,7 @@ def _info(self): "language": datasets.Value("string"), } - elif "secondary_task" in self.config.name or "tydiqa_id" in self.config.name: + elif ("goldp" in self.config.name) or ("tydiqa_id" in self.config.name): if "source" in self.config.name: features = datasets.Features( { @@ -224,7 +248,7 @@ def _split_generators(self, dl_manager): primary_downloaded = dl_manager.download_and_extract(_PRIMARY_URLS) secondary_downloaded = dl_manager.download_and_extract(_SECONDARY_URLS) - if "primary_task" in self.config.name: + if ("selectp" in self.config.name) or ("minspan" in self.config.name): return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, @@ -236,7 +260,7 @@ def _split_generators(self, dl_manager): ), ] - elif "secondary_task" in self.config.name: + elif "goldp" in self.config.name: return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, @@ -266,7 +290,7 @@ def _split_generators(self, dl_manager): def _generate_examples(self, filepath, split=None): """Yields examples.""" - if "primary_task" in self.config.name: + if ("selectp" in self.config.name) or ("minspan" in self.config.name): with open(filepath, encoding="utf-8") as f: for id_, row in enumerate(f): data = json.loads(row) @@ -278,38 +302,58 @@ def _generate_examples(self, filepath, split=None): question = data["question_text"] annotations = data["annotations"] yes_no_answers = [annotation["yes_no_answer"] for annotation in annotations] - min_answers_end_byte = [annotation["minimal_answer"]["plaintext_end_byte"] for annotation in annotations] - min_answers_start_byte = [annotation["minimal_answer"]["plaintext_start_byte"] for annotation in annotations] - passage_cand_answers = [annotation["passage_answer"]["candidate_index"] for annotation in annotations] + min_answers_end_byte = [annotation["minimal_answer"]["plaintext_end_byte"] for annotation in + annotations] + min_answers_start_byte = [annotation["minimal_answer"]["plaintext_start_byte"] for annotation in + annotations] + passage_cand_answers = [annotation["passage_answer"]["candidate_index"] for annotation in + annotations] doc = data["document_plaintext"] url = data["document_url"] - if self.config.name == "tydiqa_primary_task_source": - yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) - elif self.config.name == "tydiqa_primary_task_indonesian_source": + if (self.config.name == "tydiqa_selectp_source") or (self.config.name == "tydiqa_minspan_source"): + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, + passage_cand_answers, + min_answers_start_byte, min_answers_end_byte, yes_no_answers, + doc, url) + elif (self.config.name == "tydiqa_selectp_indonesian_source") or ( + self.config.name == "tydiqa_minspan_indonesian_source"): if lang == "indonesian": - yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) - elif self.config.name == "tydiqa_primary_task_thai_source": + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, + passage_cand_answers, + min_answers_start_byte, min_answers_end_byte, + yes_no_answers, doc, url) + elif (self.config.name == "tydiqa_selectp_thai_source") or ( + self.config.name == "tydiqa_minspan_thai_source"): if lang == "thai": - yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, + passage_cand_answers, + min_answers_start_byte, min_answers_end_byte, + yes_no_answers, doc, url) # seacrowd - elif self.config.name == "tydiqa_primary_task_seacrowd_qa": - yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) - elif self.config.name == "tydiqa_primary_task_indonesian_seacrowd_qa": + elif (self.config.name == "tydiqa_selectp_seacrowd_qa") or ( + self.config.name == "tydiqa_minspan_seacrowd_qa"): + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, + passage_cand_answers, + min_answers_start_byte, min_answers_end_byte, yes_no_answers, + lang) + elif (self.config.name == "tydiqa_selectp_indonesian_seacrowd_qa") or ( + self.config.name == "tydiqa_minspan_indonesian_seacrowd_qa"): if lang == "indonesian": - yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) - elif self.config.name == "tydiqa_primary_task_thai_seacrowd_qa": + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, + passage_cand_answers, + min_answers_start_byte, min_answers_end_byte, + yes_no_answers, lang) + elif (self.config.name == "tydiqa_selectp_thai_seacrowd_qa") or ( + self.config.name == "tydiqa_minspan_thai_seacrowd_qa"): if lang == "thai": - yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, + passage_cand_answers, + min_answers_start_byte, min_answers_end_byte, + yes_no_answers, lang) else: raise ValueError(f"No configs to match {self.config.name} in primary_task") - elif ("secondary_task" in self.config.name) or ("tydiqa_id" in self.config.name): + elif ("goldp" in self.config.name) or ("tydiqa_id" in self.config.name): with (open(filepath, encoding="utf-8") as f): data = json.load(f) tydiqa_id_num = 0 @@ -322,36 +366,43 @@ def _generate_examples(self, filepath, split=None): id_ = qa["id"] answer_starts = [answer["answer_start"] for answer in qa["answers"]] answers = [answer["text"].strip() for answer in qa["answers"]] - if self.config.name == "tydiqa_secondary_task_source": + if self.config.name == "tydiqa_goldp_source": yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) - elif self.config.name == "tydiqa_secondary_task_indonesian_source": + elif self.config.name == "tydiqa_goldp_indonesian_source": if id_.startswith("indonesian"): - yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) + yield id_, second_source_helper(id_, title, context, question, answer_starts, + answers) elif self.config.name == "tydiqa_id_source": if id_.startswith("indonesian"): tydiqa_id_num += 1 if split == "train" and tydiqa_id_num >= 856: - yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) + yield id_, second_source_helper(id_, title, context, question, answer_starts, + answers) if split == "test" and tydiqa_id_num < 856: - yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) + yield id_, second_source_helper(id_, title, context, question, answer_starts, + answers) if split == "validation": - yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) + yield id_, second_source_helper(id_, title, context, question, answer_starts, + answers) - elif self.config.name == "tydiqa_secondary_task_seacrowd_qa": + elif self.config.name == "tydiqa_goldp_seacrowd_qa": yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) - elif self.config.name == "tydiqa_secondary_task_indonesian_seacrowd_qa": + elif self.config.name == "tydiqa_goldp_indonesian_seacrowd_qa": if id_.startswith("indonesian"): yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) elif self.config.name == "tydiqa_id_seacrowd_qa": if id_.startswith("indonesian"): tydiqa_id_num += 1 if split == "train" and tydiqa_id_num >= 856: - yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) + yield id_, second_seacrowd_helper(id_, question, context, answers, + answer_starts) if split == "test" and tydiqa_id_num < 856: - yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) + yield id_, second_seacrowd_helper(id_, question, context, answers, + answer_starts) if split == "validation": - yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) + yield id_, second_seacrowd_helper(id_, question, context, answers, + answer_starts) else: raise ValueError(f"No configs to match {self.config.name} in secondary_task") @@ -418,6 +469,16 @@ def second_source_helper(id_, title, context, question, answer_starts, answers): def second_seacrowd_helper(id_, question, context, answers, answer_starts): - return {"id": id_, "question_id": id_, "document_id": id_, "question": question, - "type": "abstractive", "choices": [], "context": context, "answer": answers, - "meta": {"answer_start": answer_starts}} + return { + "id": id_, + "question_id": id_, + "document_id": id_, + "question": question, + "type": "abstractive", + "choices": [], + "context": context, + "answer": answers, + "meta": { + "answer_start": answer_starts + }, + } From cff1b57ba93f4757b0d0ea26478f2861dc08b0e3 Mon Sep 17 00:00:00 2001 From: Yuze GAO Date: Thu, 14 Mar 2024 23:17:17 +0800 Subject: [PATCH 2/4] Update Subset Composition --- seacrowd/sea_datasets/tydiqa/tydiqa.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/seacrowd/sea_datasets/tydiqa/tydiqa.py b/seacrowd/sea_datasets/tydiqa/tydiqa.py index 28b086bcb..96961d68f 100644 --- a/seacrowd/sea_datasets/tydiqa/tydiqa.py +++ b/seacrowd/sea_datasets/tydiqa/tydiqa.py @@ -141,7 +141,6 @@ class TydiqaDataset(datasets.GeneratorBasedBuilder): # goldp source schema config_constructor(subset_id="goldp", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), config_constructor(subset_id="goldp_indonesian", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), - config_constructor(subset_id="goldp_thai", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), # tydiqa_id source schema config_constructor(subset_id="id", schema="source", desc=_ID_DESP, version=_SOURCE_VERSION_P), @@ -162,7 +161,6 @@ class TydiqaDataset(datasets.GeneratorBasedBuilder): config_constructor(subset_id="goldp", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), config_constructor(subset_id="goldp_indonesian", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="goldp_thai", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), # tydiqa_id seacrowd schema config_constructor(subset_id="id", schema="seacrowd_qa", desc=_ID_DESP, version=_SEACROWD_VERSION), ] From e9d1212e40a828751b0ac474a7abdf6bc35b1ae5 Mon Sep 17 00:00:00 2001 From: Yuze GAO Date: Thu, 14 Mar 2024 23:22:16 +0800 Subject: [PATCH 3/4] Update Subset Composition --- seacrowd/sea_datasets/tydiqa/tydiqa.py | 118 ++++++++----------------- 1 file changed, 36 insertions(+), 82 deletions(-) diff --git a/seacrowd/sea_datasets/tydiqa/tydiqa.py b/seacrowd/sea_datasets/tydiqa/tydiqa.py index 96961d68f..b5a07bc1f 100644 --- a/seacrowd/sea_datasets/tydiqa/tydiqa.py +++ b/seacrowd/sea_datasets/tydiqa/tydiqa.py @@ -106,17 +106,16 @@ answer, predict the single contiguous span of characters that answers the question. This is more similar to existing reading comprehension datasets (as opposed to the information-seeking task outlined above). """ -_ID_DESP = """{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation, is a benchmark - for evaluating Indonesian natural language generation (NLG) systems. The question-answer pairs are collected - for each language without using translation services. It uses the Indonesian data from the secondary Gold - passage task of the TyDiQA dataset. As the original dataset only provides training and validation sets, +_ID_DESP = """{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation, is a benchmark + for evaluating Indonesian natural language generation (NLG) systems. The question-answer pairs are collected + for each language without using translation services. It uses the Indonesian data from the secondary Gold + passage task of the TyDiQA dataset. As the original dataset only provides training and validation sets, TydiQA-ID randomly split off 15% of the training data and use it as the test set. """ def config_constructor(subset_id, schema, desc, version): - return SEACrowdConfig(name=f"{_DATASETNAME}_{subset_id}_{schema}", description=desc, - version=datasets.Version(version), schema=schema, subset_id=subset_id) + return SEACrowdConfig(name=f"{_DATASETNAME}_{subset_id}_{schema}", description=desc, version=datasets.Version(version), schema=schema, subset_id=subset_id) class TydiqaDataset(datasets.GeneratorBasedBuilder): @@ -130,37 +129,29 @@ class TydiqaDataset(datasets.GeneratorBasedBuilder): # source schema # selectp source schema config_constructor(subset_id="selectp", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="selectp_indonesian", schema="source", desc=_SELECTP_DESP, - version=_SOURCE_VERSION_P), + config_constructor(subset_id="selectp_indonesian", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), config_constructor(subset_id="selectp_thai", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), # minspan source schema config_constructor(subset_id="minspan", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="minspan_indonesian", schema="source", desc=_MINSPAN_DESP, - version=_SOURCE_VERSION_P), + config_constructor(subset_id="minspan_indonesian", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), config_constructor(subset_id="minspan_thai", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), # goldp source schema config_constructor(subset_id="goldp", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), config_constructor(subset_id="goldp_indonesian", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), # tydiqa_id source schema config_constructor(subset_id="id", schema="source", desc=_ID_DESP, version=_SOURCE_VERSION_P), - # seacrowd schema # selectp seacrowd schema config_constructor(subset_id="selectp", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="selectp_indonesian", schema="seacrowd_qa", desc=_SELECTP_DESP, - version=_SEACROWD_VERSION), - config_constructor(subset_id="selectp_thai", schema="seacrowd_qa", desc=_SELECTP_DESP, - version=_SEACROWD_VERSION), + config_constructor(subset_id="selectp_indonesian", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="selectp_thai", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), # minspan seacrowd schema config_constructor(subset_id="minspan", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="minspan_indonesian", schema="seacrowd_qa", desc=_MINSPAN_DESP, - version=_SEACROWD_VERSION), - config_constructor(subset_id="minspan_thai", schema="seacrowd_qa", desc=_MINSPAN_DESP, - version=_SEACROWD_VERSION), + config_constructor(subset_id="minspan_indonesian", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="minspan_thai", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), # goldp seacrowd schema config_constructor(subset_id="goldp", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="goldp_indonesian", schema="seacrowd_qa", desc=_GOLDP_DESP, - version=_SEACROWD_VERSION), + config_constructor(subset_id="goldp_indonesian", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), # tydiqa_id seacrowd schema config_constructor(subset_id="id", schema="seacrowd_qa", desc=_ID_DESP, version=_SEACROWD_VERSION), ] @@ -300,54 +291,28 @@ def _generate_examples(self, filepath, split=None): question = data["question_text"] annotations = data["annotations"] yes_no_answers = [annotation["yes_no_answer"] for annotation in annotations] - min_answers_end_byte = [annotation["minimal_answer"]["plaintext_end_byte"] for annotation in - annotations] - min_answers_start_byte = [annotation["minimal_answer"]["plaintext_start_byte"] for annotation in - annotations] - passage_cand_answers = [annotation["passage_answer"]["candidate_index"] for annotation in - annotations] + min_answers_end_byte = [annotation["minimal_answer"]["plaintext_end_byte"] for annotation in annotations] + min_answers_start_byte = [annotation["minimal_answer"]["plaintext_start_byte"] for annotation in annotations] + passage_cand_answers = [annotation["passage_answer"]["candidate_index"] for annotation in annotations] doc = data["document_plaintext"] url = data["document_url"] if (self.config.name == "tydiqa_selectp_source") or (self.config.name == "tydiqa_minspan_source"): - yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, - passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, - doc, url) - elif (self.config.name == "tydiqa_selectp_indonesian_source") or ( - self.config.name == "tydiqa_minspan_indonesian_source"): + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) + elif (self.config.name == "tydiqa_selectp_indonesian_source") or (self.config.name == "tydiqa_minspan_indonesian_source"): if lang == "indonesian": - yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, - passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, - yes_no_answers, doc, url) - elif (self.config.name == "tydiqa_selectp_thai_source") or ( - self.config.name == "tydiqa_minspan_thai_source"): + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) + elif (self.config.name == "tydiqa_selectp_thai_source") or (self.config.name == "tydiqa_minspan_thai_source"): if lang == "thai": - yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, - passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, - yes_no_answers, doc, url) + yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) # seacrowd - elif (self.config.name == "tydiqa_selectp_seacrowd_qa") or ( - self.config.name == "tydiqa_minspan_seacrowd_qa"): - yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, - passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, - lang) - elif (self.config.name == "tydiqa_selectp_indonesian_seacrowd_qa") or ( - self.config.name == "tydiqa_minspan_indonesian_seacrowd_qa"): + elif (self.config.name == "tydiqa_selectp_seacrowd_qa") or (self.config.name == "tydiqa_minspan_seacrowd_qa"): + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) + elif (self.config.name == "tydiqa_selectp_indonesian_seacrowd_qa") or (self.config.name == "tydiqa_minspan_indonesian_seacrowd_qa"): if lang == "indonesian": - yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, - passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, - yes_no_answers, lang) - elif (self.config.name == "tydiqa_selectp_thai_seacrowd_qa") or ( - self.config.name == "tydiqa_minspan_thai_seacrowd_qa"): + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) + elif (self.config.name == "tydiqa_selectp_thai_seacrowd_qa") or (self.config.name == "tydiqa_minspan_thai_seacrowd_qa"): if lang == "thai": - yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, - passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, - yes_no_answers, lang) + yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) else: raise ValueError(f"No configs to match {self.config.name} in primary_task") @@ -369,20 +334,16 @@ def _generate_examples(self, filepath, split=None): elif self.config.name == "tydiqa_goldp_indonesian_source": if id_.startswith("indonesian"): - yield id_, second_source_helper(id_, title, context, question, answer_starts, - answers) + yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) elif self.config.name == "tydiqa_id_source": if id_.startswith("indonesian"): tydiqa_id_num += 1 if split == "train" and tydiqa_id_num >= 856: - yield id_, second_source_helper(id_, title, context, question, answer_starts, - answers) + yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) if split == "test" and tydiqa_id_num < 856: - yield id_, second_source_helper(id_, title, context, question, answer_starts, - answers) + yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) if split == "validation": - yield id_, second_source_helper(id_, title, context, question, answer_starts, - answers) + yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) elif self.config.name == "tydiqa_goldp_seacrowd_qa": yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) @@ -393,20 +354,16 @@ def _generate_examples(self, filepath, split=None): if id_.startswith("indonesian"): tydiqa_id_num += 1 if split == "train" and tydiqa_id_num >= 856: - yield id_, second_seacrowd_helper(id_, question, context, answers, - answer_starts) + yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) if split == "test" and tydiqa_id_num < 856: - yield id_, second_seacrowd_helper(id_, question, context, answers, - answer_starts) + yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) if split == "validation": - yield id_, second_seacrowd_helper(id_, question, context, answers, - answer_starts) + yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) else: raise ValueError(f"No configs to match {self.config.name} in secondary_task") -def primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url): +def primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url): return { "passage_answer_candidates": { "plaintext_start_byte": start_byte, @@ -426,8 +383,7 @@ def primary_source_helper(id_, start_byte, end_byte, question, title, lang, pass } -def primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, - min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang): +def primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang): return { "id": str(id_), "question_id": title, @@ -476,7 +432,5 @@ def second_seacrowd_helper(id_, question, context, answers, answer_starts): "choices": [], "context": context, "answer": answers, - "meta": { - "answer_start": answer_starts - }, + "meta": {"answer_start": answer_starts}, } From 2dd457be79c4bd5739acf6ba1356ae3d29f0e16f Mon Sep 17 00:00:00 2001 From: Yuze GAO Date: Tue, 2 Apr 2024 10:56:24 +0800 Subject: [PATCH 4/4] update subset name indonesian --> ind thai --> tha --- seacrowd/sea_datasets/tydiqa/tydiqa.py | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/seacrowd/sea_datasets/tydiqa/tydiqa.py b/seacrowd/sea_datasets/tydiqa/tydiqa.py index b5a07bc1f..2379144e6 100644 --- a/seacrowd/sea_datasets/tydiqa/tydiqa.py +++ b/seacrowd/sea_datasets/tydiqa/tydiqa.py @@ -129,29 +129,29 @@ class TydiqaDataset(datasets.GeneratorBasedBuilder): # source schema # selectp source schema config_constructor(subset_id="selectp", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="selectp_indonesian", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="selectp_thai", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="selectp_ind", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="selectp_tha", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P), # minspan source schema config_constructor(subset_id="minspan", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="minspan_indonesian", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), - config_constructor(subset_id="minspan_thai", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="minspan_ind", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), + config_constructor(subset_id="minspan_tha", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P), # goldp source schema config_constructor(subset_id="goldp", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), - config_constructor(subset_id="goldp_indonesian", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), + config_constructor(subset_id="goldp_ind", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S), # tydiqa_id source schema config_constructor(subset_id="id", schema="source", desc=_ID_DESP, version=_SOURCE_VERSION_P), # seacrowd schema # selectp seacrowd schema config_constructor(subset_id="selectp", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="selectp_indonesian", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="selectp_thai", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="selectp_ind", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="selectp_tha", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION), # minspan seacrowd schema config_constructor(subset_id="minspan", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="minspan_indonesian", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="minspan_thai", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="minspan_ind", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="minspan_tha", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION), # goldp seacrowd schema config_constructor(subset_id="goldp", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), - config_constructor(subset_id="goldp_indonesian", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), + config_constructor(subset_id="goldp_ind", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION), # tydiqa_id seacrowd schema config_constructor(subset_id="id", schema="seacrowd_qa", desc=_ID_DESP, version=_SEACROWD_VERSION), ] @@ -298,19 +298,19 @@ def _generate_examples(self, filepath, split=None): url = data["document_url"] if (self.config.name == "tydiqa_selectp_source") or (self.config.name == "tydiqa_minspan_source"): yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) - elif (self.config.name == "tydiqa_selectp_indonesian_source") or (self.config.name == "tydiqa_minspan_indonesian_source"): + elif (self.config.name == "tydiqa_selectp_ind_source") or (self.config.name == "tydiqa_minspan_ind_source"): if lang == "indonesian": yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) - elif (self.config.name == "tydiqa_selectp_thai_source") or (self.config.name == "tydiqa_minspan_thai_source"): + elif (self.config.name == "tydiqa_selectp_tha_source") or (self.config.name == "tydiqa_minspan_tha_source"): if lang == "thai": yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url) # seacrowd elif (self.config.name == "tydiqa_selectp_seacrowd_qa") or (self.config.name == "tydiqa_minspan_seacrowd_qa"): yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) - elif (self.config.name == "tydiqa_selectp_indonesian_seacrowd_qa") or (self.config.name == "tydiqa_minspan_indonesian_seacrowd_qa"): + elif (self.config.name == "tydiqa_selectp_ind_seacrowd_qa") or (self.config.name == "tydiqa_minspan_ind_seacrowd_qa"): if lang == "indonesian": yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) - elif (self.config.name == "tydiqa_selectp_thai_seacrowd_qa") or (self.config.name == "tydiqa_minspan_thai_seacrowd_qa"): + elif (self.config.name == "tydiqa_selectp_tha_seacrowd_qa") or (self.config.name == "tydiqa_minspan_tha_seacrowd_qa"): if lang == "thai": yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang) else: @@ -332,7 +332,7 @@ def _generate_examples(self, filepath, split=None): if self.config.name == "tydiqa_goldp_source": yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) - elif self.config.name == "tydiqa_goldp_indonesian_source": + elif self.config.name == "tydiqa_goldp_ind_source": if id_.startswith("indonesian"): yield id_, second_source_helper(id_, title, context, question, answer_starts, answers) elif self.config.name == "tydiqa_id_source": @@ -347,7 +347,7 @@ def _generate_examples(self, filepath, split=None): elif self.config.name == "tydiqa_goldp_seacrowd_qa": yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) - elif self.config.name == "tydiqa_goldp_indonesian_seacrowd_qa": + elif self.config.name == "tydiqa_goldp_ind_seacrowd_qa": if id_.startswith("indonesian"): yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts) elif self.config.name == "tydiqa_id_seacrowd_qa":