PR to update subset composition of TydiQA | Closes #465 #503

Merged
4 commits merged on Apr 2, 2024

Changes from all commits
seacrowd/sea_datasets/tydiqa/tydiqa.py: 137 changes (75 additions, 62 deletions)
@@ -95,23 +95,22 @@
"dev": _URL + "v1.1/tydiqa-goldp-v1.1-dev.json",
}

_PRIMARY_DESP = """Passage selection task (SelectP): Given a list of the passages in the article, return either (a) the index of
the passage that answers the question or (b) NULL if no such passage exists.
Minimal answer span task (MinSpan): Given the full text of an article, return one of (a) the start and end
byte indices of the minimal span that completely answers the question; (b) YES or NO if the question requires
a yes/no answer and we can draw a conclusion from the passage; (c) NULL if it is not possible to produce a
minimal answer for this question."""

_SECONDARY_DESP = """Gold passage task (GoldP): Given a passage that is guaranteed to contain the
_SELECTP_DESP = """Passage selection task (SelectP): Given a list of the passages in the article, return either (a) the index of
the passage that answers the question or (b) NULL if no such passage exists.
"""
_MINSPAN_DESP = """Minimal answer span task (MinSpan): Given the full text of an article, return one of (a) the start and end
byte indices of the minimal span that completely answers the question; (b) YES or NO if the question requires
a yes/no answer and we can draw a conclusion from the passage; (c) NULL if it is not possible to produce a
minimal answer for this question."""
_GOLDP_DESP = """Gold passage task (GoldP): Given a passage that is guaranteed to contain the
answer, predict the single contiguous span of characters that answers the question. This is more similar to
existing reading comprehension datasets (as opposed to the information-seeking task outlined above).
This task is constructed with two goals in mind: (1) more directly comparing with prior work and (2) providing
a simplified way for researchers to use TyDi QA by providing compatibility with existing code for SQuAD 1.1,
XQuAD, and MLQA. Toward these goals, the gold passage task differs from the primary task in several ways:
only the gold answer passage is provided rather than the entire Wikipedia article;
unanswerable questions have been discarded, similar to MLQA and XQuAD;
we evaluate with the SQuAD 1.1 metrics like XQuAD; and
Thai and Japanese are removed since the lack of whitespace breaks some tools.
"""
_ID_DESP = """{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation, is a benchmark
for evaluating Indonesian natural language generation (NLG) systems. The question-answer pairs are collected
for each language without using translation services. It uses the Indonesian data from the secondary Gold
passage task of the TyDiQA dataset. As the original dataset only provides training and validation sets,
TydiQA-ID randomly split off 15% of the training data and use it as the test set.
"""


@@ -128,24 +127,38 @@ class TydiqaDataset(datasets.GeneratorBasedBuilder):

BUILDER_CONFIGS = [
# source schema
config_constructor(subset_id="primary_task", schema="source", desc=_PRIMARY_DESP, version=_SOURCE_VERSION_P),
config_constructor(subset_id="primary_task_indonesian", schema="source", desc=_PRIMARY_DESP, version=_SOURCE_VERSION_P),
config_constructor(subset_id="primary_task_thai", schema="source", desc=_PRIMARY_DESP, version=_SOURCE_VERSION_P),
config_constructor(subset_id="secondary_task", schema="source", desc=_SECONDARY_DESP, version=_SOURCE_VERSION_S),
config_constructor(subset_id="secondary_task_indonesian", schema="source", desc=_SECONDARY_DESP, version=_SOURCE_VERSION_S),
config_constructor(subset_id="id", schema="source", desc=_SECONDARY_DESP, version=_SOURCE_VERSION_S),
# selectp source schema
config_constructor(subset_id="selectp", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P),
config_constructor(subset_id="selectp_ind", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P),
config_constructor(subset_id="selectp_tha", schema="source", desc=_SELECTP_DESP, version=_SOURCE_VERSION_P),
# minspan source schema
config_constructor(subset_id="minspan", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P),
config_constructor(subset_id="minspan_ind", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P),
config_constructor(subset_id="minspan_tha", schema="source", desc=_MINSPAN_DESP, version=_SOURCE_VERSION_P),
# goldp source schema
config_constructor(subset_id="goldp", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S),
config_constructor(subset_id="goldp_ind", schema="source", desc=_GOLDP_DESP, version=_SOURCE_VERSION_S),
# tydiqa_id source schema
config_constructor(subset_id="id", schema="source", desc=_ID_DESP, version=_SOURCE_VERSION_P),
# seacrowd schema
config_constructor(subset_id="primary_task", schema="seacrowd_qa", desc=_PRIMARY_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="primary_task_indonesian", schema="seacrowd_qa", desc=_PRIMARY_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="primary_task_thai", schema="seacrowd_qa", desc=_PRIMARY_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="secondary_task", schema="seacrowd_qa", desc=_SECONDARY_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="secondary_task_indonesian", schema="seacrowd_qa", desc=_SECONDARY_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="id", schema="seacrowd_qa", desc=_SECONDARY_DESP, version=_SEACROWD_VERSION),
# selectp seacrowd schema
config_constructor(subset_id="selectp", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="selectp_ind", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="selectp_tha", schema="seacrowd_qa", desc=_SELECTP_DESP, version=_SEACROWD_VERSION),
# minspan seacrowd schema
config_constructor(subset_id="minspan", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="minspan_ind", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="minspan_tha", schema="seacrowd_qa", desc=_MINSPAN_DESP, version=_SEACROWD_VERSION),
# goldp seacrowd schema
config_constructor(subset_id="goldp", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION),
config_constructor(subset_id="goldp_ind", schema="seacrowd_qa", desc=_GOLDP_DESP, version=_SEACROWD_VERSION),
# tydiqa_id seacrowd schema
config_constructor(subset_id="id", schema="seacrowd_qa", desc=_ID_DESP, version=_SEACROWD_VERSION),
]
DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_id_source"

def _info(self):
if "primary_task" in self.config.name:
if ("selectp" in self.config.name) or ("minspan" in self.config.name):
if "source" in self.config.name:
features = datasets.Features(
{
@@ -190,7 +203,7 @@ def _info(self):
"language": datasets.Value("string"),
}

elif "secondary_task" in self.config.name or "tydiqa_id" in self.config.name:
elif ("goldp" in self.config.name) or ("tydiqa_id" in self.config.name):
if "source" in self.config.name:
features = datasets.Features(
{
@@ -224,7 +237,7 @@ def _split_generators(self, dl_manager):
primary_downloaded = dl_manager.download_and_extract(_PRIMARY_URLS)
secondary_downloaded = dl_manager.download_and_extract(_SECONDARY_URLS)

if "primary_task" in self.config.name:
if ("selectp" in self.config.name) or ("minspan" in self.config.name):
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
@@ -236,7 +249,7 @@
),
]

elif "secondary_task" in self.config.name:
elif "goldp" in self.config.name:
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
@@ -266,7 +279,7 @@
def _generate_examples(self, filepath, split=None):
"""Yields examples."""

if "primary_task" in self.config.name:
if ("selectp" in self.config.name) or ("minspan" in self.config.name):
with open(filepath, encoding="utf-8") as f:
for id_, row in enumerate(f):
data = json.loads(row)
@@ -283,33 +296,27 @@ def _generate_examples(self, filepath, split=None):
passage_cand_answers = [annotation["passage_answer"]["candidate_index"] for annotation in annotations]
doc = data["document_plaintext"]
url = data["document_url"]
if self.config.name == "tydiqa_primary_task_source":
yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers,
min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url)
elif self.config.name == "tydiqa_primary_task_indonesian_source":
if (self.config.name == "tydiqa_selectp_source") or (self.config.name == "tydiqa_minspan_source"):
yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url)
elif (self.config.name == "tydiqa_selectp_ind_source") or (self.config.name == "tydiqa_minspan_ind_source"):
if lang == "indonesian":
yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers,
min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url)
elif self.config.name == "tydiqa_primary_task_thai_source":
yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url)
elif (self.config.name == "tydiqa_selectp_tha_source") or (self.config.name == "tydiqa_minspan_tha_source"):
if lang == "thai":
yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers,
min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url)
yield id_, primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url)
# seacrowd
elif self.config.name == "tydiqa_primary_task_seacrowd_qa":
yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers,
min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang)
elif self.config.name == "tydiqa_primary_task_indonesian_seacrowd_qa":
elif (self.config.name == "tydiqa_selectp_seacrowd_qa") or (self.config.name == "tydiqa_minspan_seacrowd_qa"):
yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang)
elif (self.config.name == "tydiqa_selectp_ind_seacrowd_qa") or (self.config.name == "tydiqa_minspan_ind_seacrowd_qa"):
if lang == "indonesian":
yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers,
min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang)
elif self.config.name == "tydiqa_primary_task_thai_seacrowd_qa":
yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang)
elif (self.config.name == "tydiqa_selectp_tha_seacrowd_qa") or (self.config.name == "tydiqa_minspan_tha_seacrowd_qa"):
if lang == "thai":
yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers,
min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang)
yield id_, primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang)
else:
raise ValueError(f"No configs to match {self.config.name} in primary_task")

elif ("secondary_task" in self.config.name) or ("tydiqa_id" in self.config.name):
elif ("goldp" in self.config.name) or ("tydiqa_id" in self.config.name):
with (open(filepath, encoding="utf-8") as f):
data = json.load(f)
tydiqa_id_num = 0
@@ -322,10 +329,10 @@
id_ = qa["id"]
answer_starts = [answer["answer_start"] for answer in qa["answers"]]
answers = [answer["text"].strip() for answer in qa["answers"]]
if self.config.name == "tydiqa_secondary_task_source":
if self.config.name == "tydiqa_goldp_source":
yield id_, second_source_helper(id_, title, context, question, answer_starts, answers)

elif self.config.name == "tydiqa_secondary_task_indonesian_source":
elif self.config.name == "tydiqa_goldp_ind_source":
if id_.startswith("indonesian"):
yield id_, second_source_helper(id_, title, context, question, answer_starts, answers)
elif self.config.name == "tydiqa_id_source":
@@ -338,9 +345,9 @@
if split == "validation":
yield id_, second_source_helper(id_, title, context, question, answer_starts, answers)

elif self.config.name == "tydiqa_secondary_task_seacrowd_qa":
elif self.config.name == "tydiqa_goldp_seacrowd_qa":
yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts)
elif self.config.name == "tydiqa_secondary_task_indonesian_seacrowd_qa":
elif self.config.name == "tydiqa_goldp_ind_seacrowd_qa":
if id_.startswith("indonesian"):
yield id_, second_seacrowd_helper(id_, question, context, answers, answer_starts)
elif self.config.name == "tydiqa_id_seacrowd_qa":
@@ -356,8 +363,7 @@
raise ValueError(f"No configs to match {self.config.name} in secondary_task")


def primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers,
min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url):
def primary_source_helper(id_, start_byte, end_byte, question, title, lang, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, doc, url):
return {
"passage_answer_candidates": {
"plaintext_start_byte": start_byte,
@@ -377,8 +383,7 @@ def primary_source_helper(id_, start_byte, end_byte, question, title, lang, pass
}


def primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers,
min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang):
def primary_seacrowd_helper(id_, title, question, doc, start_byte, end_byte, passage_cand_answers, min_answers_start_byte, min_answers_end_byte, yes_no_answers, lang):
return {
"id": str(id_),
"question_id": title,
@@ -418,6 +423,14 @@ def second_source_helper(id_, title, context, question, answer_starts, answers):


def second_seacrowd_helper(id_, question, context, answers, answer_starts):
return {"id": id_, "question_id": id_, "document_id": id_, "question": question,
"type": "abstractive", "choices": [], "context": context, "answer": answers,
"meta": {"answer_start": answer_starts}}
return {
"id": id_,
"question_id": id_,
"document_id": id_,
"question": question,
"type": "abstractive",
"choices": [],
"context": context,
"answer": answers,
"meta": {"answer_start": answer_starts},
}
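
For reference, a minimal sketch of how the renamed configs could be exercised after this change. It assumes a local checkout of the seacrowd repo and that the script is loaded directly with the Hugging Face `datasets` library (a release that still supports dataset scripts); the config names are taken from the diff above, everything else is illustrative.

import datasets

# Path to the dataloader touched by this PR, relative to a local seacrowd checkout (assumed).
SCRIPT = "seacrowd/sea_datasets/tydiqa/tydiqa.py"

# SelectP/MinSpan configs replace the old `primary_task*` subsets;
# language suffixes are shortened to `ind` / `tha`.
# trust_remote_code may be required on newer `datasets` releases that restrict script execution.
selectp_ind = datasets.load_dataset(SCRIPT, name="tydiqa_selectp_ind_source", trust_remote_code=True)

# GoldP configs replace the old `secondary_task*` subsets.
goldp_qa = datasets.load_dataset(SCRIPT, name="tydiqa_goldp_seacrowd_qa", trust_remote_code=True)

# Inspect the loaded splits and features without assuming exact field names.
print(selectp_ind)
print(goldp_qa["train"][0].keys())

In short, the old-to-new mapping in this PR: `primary_task*` splits into `selectp*` and `minspan*`, `secondary_task*` becomes `goldp*`, and the `id` (TydiQA-ID) configs keep their name but now carry the dedicated `_ID_DESP` description.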