Skip to content

Commit

Permalink
feat: chat agent schema
Browse files Browse the repository at this point in the history
  • Loading branch information
patrickamadeus committed May 13, 2024
1 parent 30a7a69 commit efa8d51
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 15 deletions.
38 changes: 23 additions & 15 deletions seacrowd/sea_datasets/onto4all/onto4all.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@

_URLS = "https://huggingface.co/datasets/ontocord/onto4all/resolve/main/data/train-00000-of-00001.parquet?download=true"

_SUPPORTED_TASKS = [Tasks.QUESTION_ANSWERING]
_SUPPORTED_TASKS = [Tasks.MULTI_TURN_CONVERSATION]

_SOURCE_VERSION = "1.0.0"

Expand All @@ -73,10 +73,10 @@ class Onto4AllDataset(datasets.GeneratorBasedBuilder):
subset_id=f"{_DATASETNAME}",
),
SEACrowdConfig(
name=f"{_DATASETNAME}_seacrowd_qa",
name=f"{_DATASETNAME}_seacrowd_chat",
version=SEACROWD_VERSION,
description=f"{_DATASETNAME} SEACrowd schema",
schema="seacrowd_qa",
schema="seacrowd_chat",
subset_id=f"{_DATASETNAME}",
),
]
Expand All @@ -98,8 +98,11 @@ def _info(self) -> datasets.DatasetInfo:
}
)

elif self.config.schema == "seacrowd_qa":
features = schemas.qa_features
elif self.config.schema == "seacrowd_chat":
features = schemas.chat_features
features["meta"] = {
"type": datasets.Value("string")
}

return datasets.DatasetInfo(
description=_DESCRIPTION,
Expand Down Expand Up @@ -140,9 +143,8 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
"type": row["type"],
"conversation": conversation,
}
break

elif self.config.schema == "seacrowd_qa":
elif self.config.schema == "seacrowd_chat":
for i, row in df.iterrows():
context = ""
question = ""
Expand All @@ -158,12 +160,18 @@ def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:

yield i, {
"id": row["id"],
"question_id": row["id"],
"document_id": "",
"question": question,
"type": row["type"],
"choices": [],
"context": context,
"answer": [answer],
"meta": {},
"input": [
{
"role": "system",
"content": context,
},
{
"role": "user",
"content": question,
},
],
"output": answer,
"meta": {
"type": row["type"],
},
}
4 changes: 4 additions & 0 deletions seacrowd/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
pairs_features_score,
pairs_multi_features,
qa_features,
chat_features,
image_features,
image_multi_features,
imqa_features,
Expand Down Expand Up @@ -105,6 +106,7 @@ class Tasks(Enum):
# Multi Text Generation
DIALOGUE_SYSTEM = "DS"
E2E_TASK_ORIENTED_DIALOGUE = "TOD"
MULTI_TURN_CONVERSATION = "MTC"

# Self Supervised & Unsupervised Text
PROMPTING = "PRT"
Expand Down Expand Up @@ -246,6 +248,7 @@ class Licenses(Enum):
Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
Tasks.COMMONSENSE_REASONING: "QA",
Tasks.QUESTION_ANSWERING: "QA",
Tasks.MULTI_TURN_CONVERSATION: "CHAT",
Tasks.CONCEPT_ALIGNMENT_CLASSIFICATION: "PAIRS",
Tasks.NEXT_SENTENCE_PREDICTION: "PAIRS",
Tasks.TEXT_RETRIEVAL: "PAIRS",
Expand Down Expand Up @@ -313,6 +316,7 @@ class Licenses(Enum):
"KB": kb_features,
"TREE": tree_features,
"QA": qa_features,
"CHAT": chat_features,
"T2T": text2text_features,
"TEXT": text_features(),
"TEXT_MULTI": text_multi_features(),
Expand Down
2 changes: 2 additions & 0 deletions seacrowd/utils/schemas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .pairs import features_with_continuous_label as pairs_features_score
from .pairs_multilabel import features as pairs_multi_features
from .qa import features as qa_features
from .chat import features as chat_features
from .image import features as image_features
from .image import multi_features as image_multi_features
from .imqa import features as imqa_features
Expand All @@ -28,6 +29,7 @@
"pairs_features_score",
"pairs_multi_features",
"qa_features",
"chat_features",
"image_features",
"image_multi_features",
"imqa_features",
Expand Down
24 changes: 24 additions & 0 deletions seacrowd/utils/schemas/chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Conversational Chat Schema
"""
import datasets

# Feature schema for chat-style examples: a string id, the list of input
# turns, the output text, and a free-form `meta` field.
features = datasets.Features(
    {
        # Example identifier, stored as a string.
        "id": datasets.Value("string"),
        # Input turns. Each turn carries a `role` (one of "system", "user",
        # "assistant") and its string `content`.
        # NOTE(review): the Hugging Face `datasets` docs state that a
        # `Sequence` wrapping a dict is exposed as a dict of lists rather than
        # a list of dicts -- confirm that consumers of this schema expect that.
        "input": datasets.Sequence({
            "role": datasets.ClassLabel(names=["system", "user", "assistant"]),
            "content": datasets.Value("string"),
        }),
        # Output text, stored as a string.
        "output": datasets.Value("string"),

        # The schema of `meta` is intentionally left unspecified to allow
        # some flexibility.
        "meta": {}

        # How to use the `meta` field -- choose one of two options:
        #   1. leave it as an empty dict if you don't think it is useful in
        #      `_generate_examples`, or
        #   2. in the dataloader's `_info` method, define `meta` as a dict that
        #      maps each intended meta column name to its `datasets` feature
        #      type (e.g. `datasets.Value("string")`), then populate it with
        #      the values in the dataloader's `_generate_examples` method.
    }
)

0 comments on commit efa8d51

Please sign in to comment.