diff --git a/seacrowd/sea_datasets/onto4all/onto4all.py b/seacrowd/sea_datasets/onto4all/onto4all.py index 99b0c8960..c08c73d18 100644 --- a/seacrowd/sea_datasets/onto4all/onto4all.py +++ b/seacrowd/sea_datasets/onto4all/onto4all.py @@ -52,7 +52,7 @@ _URLS = "https://huggingface.co/datasets/ontocord/onto4all/resolve/main/data/train-00000-of-00001.parquet?download=true" -_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING] +_SUPPORTED_TASKS = [Tasks.MULTI_TURN_CONVERSATION] _SOURCE_VERSION = "1.0.0" @@ -73,10 +73,10 @@ class Onto4AllDataset(datasets.GeneratorBasedBuilder): subset_id=f"{_DATASETNAME}", ), SEACrowdConfig( - name=f"{_DATASETNAME}_seacrowd_qa", + name=f"{_DATASETNAME}_seacrowd_chat", version=SEACROWD_VERSION, description=f"{_DATASETNAME} SEACrowd schema", - schema="seacrowd_qa", + schema="seacrowd_chat", subset_id=f"{_DATASETNAME}", ), ] @@ -98,8 +98,11 @@ def _info(self) -> datasets.DatasetInfo: } ) - elif self.config.schema == "seacrowd_qa": - features = schemas.qa_features + elif self.config.schema == "seacrowd_chat": + features = schemas.chat_features + features["meta"] = { + "type": datasets.Value("string") + } return datasets.DatasetInfo( description=_DESCRIPTION, @@ -140,9 +143,8 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: "type": row["type"], "conversation": conversation, } - break - elif self.config.schema == "seacrowd_qa": + elif self.config.schema == "seacrowd_chat": for i, row in df.iterrows(): context = "" question = "" @@ -158,12 +160,18 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]: yield i, { "id": row["id"], - "question_id": row["id"], - "document_id": "", - "question": question, - "type": row["type"], - "choices": [], - "context": context, - "answer": [answer], - "meta": {}, + "input": [ + { + "role": "system", + "content": context, + }, + { + "role": "user", + "content": question, + }, + ], + "output": answer, + "meta": { + "type": row["type"], + }, } diff --git a/seacrowd/utils/constants.py 
b/seacrowd/utils/constants.py index cef293257..fc4c1752b 100644 --- a/seacrowd/utils/constants.py +++ b/seacrowd/utils/constants.py @@ -10,6 +10,7 @@ pairs_features_score, pairs_multi_features, qa_features, + chat_features, image_features, image_multi_features, imqa_features, @@ -105,6 +106,7 @@ class Tasks(Enum): # Multi Text Generation DIALOGUE_SYSTEM = "DS" E2E_TASK_ORIENTED_DIALOGUE = "TOD" + MULTI_TURN_CONVERSATION = "MTC" # Self Supervised & Unsupervised Text PROMPTING = "PRT" @@ -246,6 +248,7 @@ class Licenses(Enum): Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL", Tasks.COMMONSENSE_REASONING: "QA", Tasks.QUESTION_ANSWERING: "QA", + Tasks.MULTI_TURN_CONVERSATION: "CHAT", Tasks.CONCEPT_ALIGNMENT_CLASSIFICATION: "PAIRS", Tasks.NEXT_SENTENCE_PREDICTION: "PAIRS", Tasks.TEXT_RETRIEVAL: "PAIRS", @@ -313,6 +316,7 @@ class Licenses(Enum): "KB": kb_features, "TREE": tree_features, "QA": qa_features, + "CHAT": chat_features, "T2T": text2text_features, "TEXT": text_features(), "TEXT_MULTI": text_multi_features(), diff --git a/seacrowd/utils/schemas/__init__.py b/seacrowd/utils/schemas/__init__.py index ec4c035f8..5a30ac568 100644 --- a/seacrowd/utils/schemas/__init__.py +++ b/seacrowd/utils/schemas/__init__.py @@ -5,6 +5,7 @@ from .pairs import features_with_continuous_label as pairs_features_score from .pairs_multilabel import features as pairs_multi_features from .qa import features as qa_features +from .chat import features as chat_features from .image import features as image_features from .image import multi_features as image_multi_features from .imqa import features as imqa_features @@ -28,6 +29,7 @@ "pairs_features_score", "pairs_multi_features", "qa_features", + "chat_features", "image_features", "image_multi_features", "imqa_features", diff --git a/seacrowd/utils/schemas/chat.py b/seacrowd/utils/schemas/chat.py new file mode 100644 index 000000000..f88ea8f4b --- /dev/null +++ b/seacrowd/utils/schemas/chat.py @@ -0,0 +1,24 @@ +""" +Conversational Chat 
Schema +""" +import datasets + +features = datasets.Features( + { + "id": datasets.Value("string"), + "input": datasets.Sequence({ + "role": datasets.ClassLabel(names=["system", "user", "assistant"]), + "content": datasets.Value("string"), + }), + "output": datasets.Value("string"), + + # the schema of 'meta' is deliberately left unspecified to allow some flexibility + "meta": {} + + # notes on how to use the 'meta' field + # you can choose one of two options: + # 1. leave it as an empty dict if you don't think it's usable in `_generate_examples`, or + # 2. define meta as a dict mapping each intended meta column name to its `datasets` feature type (e.g. `datasets.Value`) + # in the `_info` Dataloader method, then populate it with the values in the `_generate_examples` Dataloader method + } +)