Skip to content

Commit 4e40815

Browse files
authored
Implement CommonVoice 12.0 dataloader (SEACrowd#452)
1 parent c3d61a8 commit 4e40815

File tree

2 files changed

+208
-0
lines changed

seacrowd/sea_datasets/commonvoice_120/__init__.py

Whitespace-only changes.
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
# coding=utf-8
import csv
import json
import os
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

# BibTeX entry for the Common Voice corpus paper (LREC 2020).
_CITATION = """\
@inproceedings{commonvoice:2020,
author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
title = {Common Voice: A Massively-Multilingual Speech Corpus},
booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
pages = {4211--4215},
year = 2020
}
"""

_DATASETNAME = "commonvoice_120"

_DESCRIPTION = """\
The Common Mozilla Voice dataset consists of a unique MP3 and corresponding text file.
Many of the 26119 recorded hours in the dataset also include demographic metadata like age, sex, and accent that can help improve the accuracy of speech recognition engines.
The dataset currently consists of 17127 validated hours in 104 languages, but more voices and languages are always added.

Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/mozilla-foundation/common_voice_12_0 and use huggingface-cli login for authentication
"""

_HOMEPAGE = "https://commonvoice.mozilla.org/en/datasets"

# Southeast Asian languages covered by this dataloader (SEACrowd codes).
_LANGUAGES = ["cnh", "ind", "tha", "vie"]
# Maps SEACrowd language codes to the codes Common Voice uses in its URLs.
_LANG_TO_CVLANG = {"cnh": "cnh", "ind": "id", "tha": "th", "vie": "vi"}

# Maps Common Voice age-bucket labels to a representative integer age;
# "" (unknown) maps to None. NOTE(review): "fourties" appears to be the
# upstream label spelling — confirm against the transcript TSVs.
_AGE_TO_INT = {"": None, "teens": 10, "twenties": 20, "thirties": 30, "fourties": 40, "fifties": 50, "sixties": 60, "seventies": 70, "eighties": 80}

_LICENSE = Licenses.CC0_1_0.value

# Note: the dataset is gated in HuggingFace. It's public after providing access token
_LOCAL = False

_COMMONVOICE_URL_TEMPLATE = "https://huggingface.co/datasets/mozilla-foundation/common_voice_12_0/resolve/main/"
# URL templates: per-split tar shards of audio, per-split transcript TSVs,
# and a JSON file listing the shard count per language/split.
_URLS = {"audio": _COMMONVOICE_URL_TEMPLATE + "audio/{lang}/{split}/{lang}_{split}_{shard_idx}.tar", "transcript": _COMMONVOICE_URL_TEMPLATE + "transcript/{lang}/{split}.tsv", "n_shards": _COMMONVOICE_URL_TEMPLATE + "n_shards.json"}

_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION, Tasks.TEXT_TO_SPEECH]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"
56+
class Commonvoice120(datasets.GeneratorBasedBuilder):
    """Dataloader for Mozilla CommonVoice 12.0 (SEACrowd).

    Exposes a ``source`` schema mirroring the upstream HuggingFace dataset
    fields and the SEACrowd ``seacrowd_sptext`` speech-text schema.
    The dataset is gated on HuggingFace: access must be granted and
    ``huggingface-cli login`` run before downloading.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    # One pair of configs (source + seacrowd_sptext) for the aggregate
    # subset ("" -> all of _LANGUAGES) and for each individual language.
    BUILDER_CONFIGS = (
        *[
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{lang}{'_' if lang else ''}source",
                version=datasets.Version(_SOURCE_VERSION),
                description=f"{_DATASETNAME} source schema for {lang}",
                schema="source",
                subset_id=f"{_DATASETNAME}{'_' if lang else ''}{lang}",
            )
            for lang in ["", *_LANGUAGES]
        ],
        *[
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{lang}{'_' if lang else ''}seacrowd_sptext",
                version=datasets.Version(_SEACROWD_VERSION),
                description=f"{_DATASETNAME} SEACrowd schema for {lang}",
                schema="seacrowd_sptext",
                subset_id=f"{_DATASETNAME}{'_' if lang else ''}{lang}",
            )
            for lang in ["", *_LANGUAGES]
        ],
    )

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build DatasetInfo whose features match the active schema."""
        if self.config.schema == "source":
            # Field set mirrors the upstream CommonVoice TSV columns.
            features = datasets.Features(
                {
                    "client_id": datasets.Value("string"),
                    "path": datasets.Value("string"),
                    "audio": datasets.features.Audio(sampling_rate=48_000),
                    "sentence": datasets.Value("string"),
                    "up_votes": datasets.Value("int64"),
                    "down_votes": datasets.Value("int64"),
                    "age": datasets.Value("string"),
                    "gender": datasets.Value("string"),
                    "accent": datasets.Value("string"),
                    "locale": datasets.Value("string"),
                    "segment": datasets.Value("string"),
                }
            )
        elif self.config.schema == "seacrowd_sptext":
            features = schemas.speech_text_features

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download audio shards and transcripts, one SplitGenerator per split.

        The subset_id suffix selects a single language; the bare
        "commonvoice_120" subset (suffix "120") loads all of _LANGUAGES.
        """
        lang_code = self.config.subset_id.split("_")[-1]
        languages = [_LANG_TO_CVLANG.get(lang, lang) for lang in (_LANGUAGES if lang_code == "120" else [lang_code])]
        # n_shards.json lists how many audio tar shards exist per language/split.
        n_shards_path = dl_manager.download_and_extract(_URLS["n_shards"])
        with open(n_shards_path, encoding="utf-8") as f:
            n_shards = json.load(f)

        audio_urls = {}
        meta_urls = {}
        splits = ("train", "dev", "test")
        for split in splits:
            audio_urls[split] = [_URLS["audio"].format(lang=lang, split=split, shard_idx=i) for lang in languages for i in range(n_shards[lang][split])]
            meta_urls[split] = [_URLS["transcript"].format(lang=lang, split=split) for lang in languages]
        # Download and extraction are kept separate so the (unextracted)
        # archive paths can still be iterated with iter_archive below.
        archive_paths = dl_manager.download(audio_urls)
        local_extracted_archive_paths = dl_manager.extract(archive_paths)
        meta_paths = dl_manager.download_and_extract(meta_urls)

        split_names = {
            "train": datasets.Split.TRAIN,
            "dev": datasets.Split.VALIDATION,
            "test": datasets.Split.TEST,
        }
        return [
            datasets.SplitGenerator(
                name=split_names.get(split, split),
                gen_kwargs={
                    "local_extracted_archive_paths": local_extracted_archive_paths.get(split),
                    "audio_archives": [dl_manager.iter_archive(path) for path in archive_paths.get(split)],
                    "meta_paths": meta_paths[split],
                    # BUG FIX: was hard-coded to "train", mislabelling the
                    # dev/test generators; pass the actual split name.
                    "split": split,
                },
            )
            for split in splits
        ]

    def _generate_examples(self, local_extracted_archive_paths: List[Path], audio_archives: List[Path], meta_paths: List[Path], split: str) -> Tuple[int, Dict]:
        """Yield (key, example) pairs, joining TSV metadata to audio files.

        Metadata rows are keyed by mp3 filename; files present in the tar
        archives without a matching transcript row are skipped.
        (Annotations fixed: were runtime list literals ``[Path]`` instead of
        ``List[Path]``.)
        """
        data_fields = list(self._info().features.keys())
        metadata = {}
        for meta_path in meta_paths:
            with open(meta_path, encoding="utf-8") as f:
                reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
                for row in reader:
                    if not row["path"].endswith(".mp3"):
                        row["path"] += ".mp3"
                    # Newer CV releases name the column "accents"; map it
                    # back to the "accent" field this loader exposes.
                    if "accents" in row:
                        row["accent"] = row["accents"]
                        del row["accents"]
                    # Pad any columns missing from this TSV so the lookups
                    # below never raise KeyError.
                    for field in data_fields:
                        if field not in row:
                            row[field] = ""
                    metadata[row["path"]] = row

        if self.config.schema == "source":
            for i, audio_archive in enumerate(audio_archives):
                for path, file in audio_archive:
                    _, filename = os.path.split(path)
                    if filename in metadata:
                        src_result = dict(metadata[filename])
                        # Point "path" at the locally extracted file.
                        path = os.path.join(local_extracted_archive_paths[i], path)
                        result = {
                            "client_id": src_result["client_id"],
                            "path": path,
                            "audio": {"path": path, "bytes": file.read()},
                            "sentence": src_result["sentence"],
                            "up_votes": src_result["up_votes"],
                            "down_votes": src_result["down_votes"],
                            "age": src_result["age"],
                            "gender": src_result["gender"],
                            "accent": src_result["accent"],
                            "locale": src_result["locale"],
                            "segment": src_result["segment"],
                        }
                        yield path, result

        elif self.config.schema == "seacrowd_sptext":
            for i, audio_archive in enumerate(audio_archives):
                for path, file in audio_archive:
                    _, filename = os.path.split(path)
                    if filename in metadata:
                        src_result = dict(metadata[filename])
                        # set the audio feature and the path to the extracted file
                        path = os.path.join(local_extracted_archive_paths[i], path)
                        result = {
                            "id": src_result["path"].replace(".mp3", ""),
                            "path": path,
                            "audio": {"path": path, "bytes": file.read()},
                            "text": src_result["sentence"],
                            "speaker_id": src_result["client_id"],
                            "metadata": {
                                "speaker_age": _AGE_TO_INT[src_result["age"]],
                                "speaker_gender": src_result["gender"],
                            },
                        }
                        yield path, result

0 commit comments

Comments
 (0)