Skip to content

Commit 6bcef7f

Browse files
authored
Closes SEACrowd#102 | Add Dataloader for Thai SER (SEACrowd#505)
* add dataloader for thai_ser * add attributes * add attributes * update data splits
1 parent 490d668 commit 6bcef7f

File tree

2 files changed

+233
-0
lines changed

2 files changed

+233
-0
lines changed

seacrowd/sea_datasets/thai_ser/__init__.py

Whitespace-only changes.
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
# coding=utf-8
2+
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import glob
17+
import json
18+
import os
19+
from pathlib import Path
20+
from typing import Dict, List, Tuple, Union
21+
22+
import datasets
23+
24+
from seacrowd.utils.configs import SEACrowdConfig
25+
from seacrowd.utils.constants import Licenses, Tasks
26+
27+
# The dataset release has no accompanying paper, so the citation is left empty.
_CITATION = """\
"""
_DATASETNAME = "thai_ser"
_DESCRIPTION = """\
THAI SER dataset consists of 5 main emotions assigned to actors: Neutral,
Anger, Happiness, Sadness, and Frustration. The recordings were 41 hours,
36 minutes long (27,854 utterances), and were performed by 200 professional
actors (112 female, 88 male) and directed by students, former alumni, and
professors from the Faculty of Arts, Chulalongkorn University. The THAI SER
contains 100 recordings and is separated into two main categories: Studio and
Zoom. Studio recordings also consist of two studio environments: Studio A, a
controlled studio room with soundproof walls, and Studio B, a normal room
without soundproof or noise control.
"""
_HOMEPAGE = "https://github.com/vistec-AI/dataset-releases/releases/tag/v1"
_LANGUAGES = ["tha"]
_LICENSE = Licenses.CC_BY_SA_4_0.value
_LOCAL = False

# Download locations, grouped by role:
#   - "actor_demography" / "emotion_label": JSON side files (speaker info and labels)
#   - "studio" / "zoom": audio archives, keyed by archive name
_URLS = {
    "actor_demography": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/actor_demography.json",
    "emotion_label": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/emotion_label.json",
    "studio": {
        "studio1-10": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/studio1-10.zip",
        "studio11-20": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/studio11-20.zip",
        "studio21-30": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/studio21-30.zip",
        "studio31-40": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/studio31-40.zip",
        "studio41-50": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/studio41-50.zip",
        "studio51-60": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/studio51-60.zip",
        "studio61-70": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/studio61-70.zip",
        "studio71-80": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/studio71-80.zip",
    },
    "zoom": {
        "zoom1-10": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/zoom1-10.zip",
        "zoom11-20": "https://github.com/vistec-AI/dataset-releases/releases/download/v1/zoom11-20.zip",
    },
}
# Combined setting: every studio archive followed by every zoom archive.
_URLS["studio_zoom"] = dict(_URLS["studio"])
_URLS["studio_zoom"].update(_URLS["zoom"])

_SUPPORTED_TASKS = [Tasks.SPEECH_EMOTION_RECOGNITION]

_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"
68+
69+
70+
class ThaiSER(datasets.GeneratorBasedBuilder):
    """Thai speech emotion recognition dataset THAI SER contains 100 recordings (80 studios and 20 zooms).

    Four configurations are exposed: the studio-only subset and the
    studio-plus-zoom subset, each in the "source" schema and in the
    SEACrowd "speech" schema. All of them produce a single TRAIN split.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    SEACROWD_SCHEMA_NAME = "speech"
    # Order matters: scripted file names encode the emotion as a 1-based index into this list.
    _LABELS = ["Neutral", "Angry", "Happy", "Sad", "Frustrated"]

    BUILDER_CONFIGS = [
        # studio
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
            subset_id=f"{_DATASETNAME}",
        ),
        # studio and zoom
        SEACrowdConfig(
            name=f"{_DATASETNAME}_include_zoom_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}_include_zoom",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_include_zoom_seacrowd_{SEACROWD_SCHEMA_NAME}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
            subset_id=f"{_DATASETNAME}_include_zoom",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the DatasetInfo (features and static metadata) for the active config.

        Raises:
            ValueError: if the config schema is neither "source" nor the
                SEACrowd speech schema.
        """
        # Fields shared by both schemas. Mirrors schemas.speech_features(self._LABELS)
        # except that the audio is declared at a 44.1 kHz sampling rate.
        common_features = {
            "id": datasets.Value("string"),
            "path": datasets.Value("string"),
            "audio": datasets.Audio(sampling_rate=44_100),
            "speaker_id": datasets.Value("string"),
            "labels": datasets.ClassLabel(names=self._LABELS),
        }
        metadata_features = {
            "speaker_age": datasets.Value("int64"),
            "speaker_gender": datasets.Value("string"),
        }

        if self.config.schema == "source":
            features = datasets.Features(
                {
                    **common_features,
                    "majority_emo": datasets.Value("string"),  # 'None' when no single majority
                    "annotated": datasets.Value("string"),
                    "agreement": datasets.Value("float32"),
                    "metadata": metadata_features,
                }
            )
        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
            features = datasets.Features({**common_features, "metadata": metadata_features})
        else:
            # Fail loudly instead of hitting an UnboundLocalError on `features` below.
            raise ValueError(f"Invalid config schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators.

        Downloads the two JSON side files plus every audio archive of the
        selected setting, and returns a single TRAIN split over them.
        """
        # Config names containing "zoom" select the studio + zoom archives; the others are studio only.
        setting = "studio_zoom" if "zoom" in self.config.name else "studio"

        data_paths = {
            "actor_demography": Path(dl_manager.download_and_extract(_URLS["actor_demography"])),
            "emotion_label": Path(dl_manager.download_and_extract(_URLS["emotion_label"])),
            setting: {},
        }
        for url_name, url_path in _URLS[setting].items():
            data_paths[setting][url_name] = Path(dl_manager.download_and_extract(url_path))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "actor_demography_filepath": data_paths["actor_demography"],
                    "emotion_label_filepath": data_paths["emotion_label"],
                    "data_filepath": data_paths[setting],
                    "split": "train",
                },
            )
        ]

    def _generate_examples(self, actor_demography_filepath: Path, emotion_label_filepath: Path, data_filepath: Dict[str, Union[Path, Dict]], split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples.

        The key is the audio file name without its ".flac" extension.

        Args:
            actor_demography_filepath: JSON file whose "data" list holds one
                entry per actor ("Actor's ID", "Age", "Sex").
            emotion_label_filepath: JSON file mapping audio file names to a
                list of annotation records.
            data_filepath: mapping of archive name to extracted folder; every
                folder is searched recursively for ".flac" files.
            split: split name; accepted for interface compatibility, unused.

        Raises:
            ValueError: if the config schema is neither "source" nor the
                SEACrowd speech schema.
        """
        schema = self.config.schema
        is_source = schema == "source"
        if not is_source and schema != f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
            # Fail loudly instead of hitting an UnboundLocalError on the example later.
            raise ValueError(f"Invalid config schema: {schema}")

        # read actor_demography file
        with open(actor_demography_filepath, "r", encoding="utf-8") as actor_demography_file:
            actor_demography = json.load(actor_demography_file)
        speaker_metadata = {
            actor["Actor's ID"]: {"speaker_age": actor["Age"], "speaker_gender": actor["Sex"].lower()}
            for actor in actor_demography["data"]
        }

        # read emotion_label file
        with open(emotion_label_filepath, "r", encoding="utf-8") as emotion_label_file:
            emotion_label = json.load(emotion_label_file)

        speaker_prefix = "actor"

        # iterate through data folders
        for folder_path in data_filepath.values():
            # glob returns files in arbitrary order; sort so generation is reproducible.
            flac_files = sorted(glob.glob(os.path.join(folder_path, "**/*.flac"), recursive=True))
            # iterate through recordings
            for audio_path in flac_files:
                # basename/splitext instead of split("/") and str.strip(".flac"):
                # strip() removes a *set of characters* from both ends, not a suffix,
                # and "/" is not the separator on every platform.
                file_name = os.path.basename(audio_path)
                example_id = os.path.splitext(file_name)[0]

                # Third "_"-separated field carries the speaker, e.g. "actor001" -> "001".
                speaker_field = file_name.split("_")[2]
                if speaker_field.startswith(speaker_prefix):
                    speaker_id = speaker_field[len(speaker_prefix):]
                else:
                    speaker_id = speaker_field

                # labels in emotion_label are incomplete, labels only provided for microphone types: mic, con
                # otherwise, obtain label from id for scripted utterances and skip sample for the improvised utterances
                if file_name in emotion_label:
                    annotation = emotion_label[file_name][0]
                    assigned_emo = annotation["assigned_emo"]
                    majority_emo = annotation["majority_emo"]
                    agreement = annotation["agreement"]
                    annotated = annotation["annotated"]
                elif "script" in file_name:
                    # Emotion (1 = Neutral, 2 = Angry, 3 = Happy, 4 = Sad, 5 = Frustrated)
                    emotion_digit = file_name.split("_")[-1][0]
                    assigned_emo = self._LABELS[int(emotion_digit) - 1]
                    majority_emo = agreement = annotated = None
                else:
                    continue

                example = {
                    "id": example_id,
                    "path": audio_path,
                    "audio": audio_path,
                    "speaker_id": speaker_id,
                    "labels": assigned_emo,
                }
                if is_source:
                    # Annotation details are only part of the source schema.
                    example["majority_emo"] = majority_emo
                    example["agreement"] = agreement
                    example["annotated"] = annotated
                example["metadata"] = dict(speaker_metadata[speaker_id])

                yield example_id, example

0 commit comments

Comments
 (0)