diff --git a/seacrowd/utils/constants.py b/seacrowd/utils/constants.py
index 388ac1c62..707b4cf8f 100644
--- a/seacrowd/utils/constants.py
+++ b/seacrowd/utils/constants.py
@@ -14,6 +14,7 @@
     ssp_features,
     speech_text_features,
     speech2speech_features,
+    speech_multi_features,
     image_text_features,
 )
 
@@ -78,6 +79,7 @@ class Tasks(Enum):
     # SpeechText
     SPEECH_RECOGNITION = "ASR"
     SPEECH_TO_TEXT_TRANSLATION = "STTT"
+    SPEECH_TO_TEXT_CLASSIFICATION = "STTC"
     TEXT_TO_SPEECH = "TTS"
 
     # SpeechSpeech
@@ -205,6 +207,7 @@ class Licenses(Enum):
     Tasks.SPEECH_TO_TEXT_TRANSLATION: "SPTEXT",
     Tasks.TEXT_TO_SPEECH: "SPTEXT",
     Tasks.SPEECH_TO_SPEECH_TRANSLATION: "S2S",
+    Tasks.SPEECH_TO_TEXT_CLASSIFICATION: "SC",
     Tasks.IMAGE_CAPTIONING: "IMTEXT",
     Tasks.STYLIZED_IMAGE_CAPTIONING: "IMTEXT",
     Tasks.VISUALLY_GROUNDED_REASONING: "IMTEXT",
@@ -234,6 +237,7 @@ class Licenses(Enum):
     "SSP": ssp_features,
     "SPTEXT": speech_text_features,
     "S2S": speech2speech_features,
+    "SC": speech_multi_features(),
     "IMTEXT": image_text_features(),
 }
 
diff --git a/seacrowd/utils/schemas/__init__.py b/seacrowd/utils/schemas/__init__.py
index d75c752c4..c8fc54bc6 100644
--- a/seacrowd/utils/schemas/__init__.py
+++ b/seacrowd/utils/schemas/__init__.py
@@ -10,6 +10,7 @@
 from .self_supervised_pretraining import features as ssp_features
 from .speech_text import features as speech_text_features
 from .speech_to_speech import features as speech2speech_features
+from .speech_classification import features as speech_multi_features
 from .image_text import features as image_text_features
 
-__all__ = ["kb_features", "qa_features", "text2text_features", "text_features", "text_multi_features", "pairs_features", "pairs_multi_features", "pairs_features_score", "seq_label_features", "ssp_features", "speech_text_features", "speech2speech_features", "image_text_features"]
+__all__ = ["kb_features", "qa_features", "text2text_features", "text_features", "text_multi_features", "pairs_features", "pairs_multi_features", "pairs_features_score", "seq_label_features", "ssp_features", "speech_text_features", "speech2speech_features", "speech_multi_features", "image_text_features"]
diff --git a/seacrowd/utils/schemas/speech_classification.py b/seacrowd/utils/schemas/speech_classification.py
new file mode 100644
index 000000000..d3f1b50af
--- /dev/null
+++ b/seacrowd/utils/schemas/speech_classification.py
@@ -0,0 +1,34 @@
+"""
+Speech Classification Schema (Speech to Label, be it binary or multiclass)
+"""
+import datasets
+
+
+def features(label_names=None):
+    """Build the Features schema for speech classification examples.
+
+    Args:
+        label_names: Class names for the "labels" field. When omitted or
+            None, the binary ["Yes", "No"] label set is used.
+
+    Returns:
+        A datasets.Features with id, path, audio (16 kHz), text, speaker_id,
+        labels (a sequence of ClassLabel) and speaker metadata fields.
+    """
+    # Build a fresh list per call; a list literal used as the default argument
+    # is a single object shared by every schema created with the defaults.
+    names = ["Yes", "No"] if label_names is None else label_names
+    return datasets.Features(
+        {
+            "id": datasets.Value("string"),
+            "path": datasets.Value("string"),
+            "audio": datasets.Audio(sampling_rate=16_000),
+            "text": datasets.Value("string"),
+            "speaker_id": datasets.Value("string"),
+            "labels": datasets.Sequence(datasets.ClassLabel(names=names)),
+            "metadata": {
+                "speaker_age": datasets.Value("int64"),
+                "speaker_gender": datasets.Value("string"),
+            },
+        }
+    )