|
| 1 | +# coding=utf-8 |
| 2 | +import csv |
| 3 | +import json |
| 4 | +import os |
| 5 | +from pathlib import Path |
| 6 | +from typing import Dict, List, Tuple |
| 7 | + |
| 8 | +import datasets |
| 9 | + |
| 10 | +from seacrowd.utils import schemas |
| 11 | +from seacrowd.utils.configs import SEACrowdConfig |
| 12 | +from seacrowd.utils.constants import Licenses, Tasks |
| 13 | + |
_CITATION = """\
@inproceedings{commonvoice:2020,
  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
  title = {Common Voice: A Massively-Multilingual Speech Corpus},
  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},
  pages = {4211--4215},
  year = 2020
}
"""

_DATASETNAME = "commonvoice_120"

_DESCRIPTION = """\
The Mozilla Common Voice dataset consists of a unique MP3 and corresponding text file.
Many of the 26119 recorded hours in the dataset also include demographic metadata like age, sex, and accent that can help improve the accuracy of speech recognition engines.
The dataset currently consists of 17127 validated hours in 104 languages, but more voices and languages are always added.

Before using this dataloader, please accept the acknowledgement at https://huggingface.co/datasets/mozilla-foundation/common_voice_12_0 and use huggingface-cli login for authentication
"""

_HOMEPAGE = "https://commonvoice.mozilla.org/en/datasets"

# SEACrowd ISO 639-3 language codes and their Common Voice locale equivalents.
_LANGUAGES = ["cnh", "ind", "tha", "vie"]
_LANG_TO_CVLANG = {"cnh": "cnh", "ind": "id", "tha": "th", "vie": "vi"}

# Common Voice age buckets mapped to the decade's lower bound; "" means the
# contributor did not report an age. "fourties" is Common Voice's own
# (misspelled) label — do not "fix" it, it must match the TSV values.
# "nineties" was missing before and caused a KeyError on such rows.
_AGE_TO_INT = {"": None, "teens": 10, "twenties": 20, "thirties": 30, "fourties": 40, "fifties": 50, "sixties": 60, "seventies": 70, "eighties": 80, "nineties": 90}

_LICENSE = Licenses.CC0_1_0.value

# Note: the dataset is gated in HuggingFace. It's public after providing access token
_LOCAL = False

_COMMONVOICE_URL_TEMPLATE = "https://huggingface.co/datasets/mozilla-foundation/common_voice_12_0/resolve/main/"
_URLS = {
    "audio": _COMMONVOICE_URL_TEMPLATE + "audio/{lang}/{split}/{lang}_{split}_{shard_idx}.tar",
    "transcript": _COMMONVOICE_URL_TEMPLATE + "transcript/{lang}/{split}.tsv",
    "n_shards": _COMMONVOICE_URL_TEMPLATE + "n_shards.json",
}

_SUPPORTED_TASKS = [Tasks.SPEECH_RECOGNITION, Tasks.TEXT_TO_SPEECH]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"
| 54 | + |
| 55 | + |
class Commonvoice120(datasets.GeneratorBasedBuilder):
    """Dataloader for Mozilla Common Voice 12.0.

    Exposes one "source" config (upstream Common Voice fields) and one
    SEACrowd "sptext" config per language in ``_LANGUAGES``, plus one
    aggregate config each (empty language suffix) that loads all of them.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    # lang == "" builds the aggregate config names (e.g. "commonvoice_120_source").
    BUILDER_CONFIGS = (
        *[
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{lang}{'_' if lang else ''}source",
                version=datasets.Version(_SOURCE_VERSION),
                description=f"{_DATASETNAME} source schema for {lang}",
                schema="source",
                subset_id=f"{_DATASETNAME}{'_' if lang else ''}{lang}",
            )
            for lang in ["", *_LANGUAGES]
        ],
        *[
            SEACrowdConfig(
                name=f"{_DATASETNAME}_{lang}{'_' if lang else ''}seacrowd_sptext",
                version=datasets.Version(_SEACROWD_VERSION),
                description=f"{_DATASETNAME} SEACrowd schema for {lang}",
                schema="seacrowd_sptext",
                subset_id=f"{_DATASETNAME}{'_' if lang else ''}{lang}",
            )
            for lang in ["", *_LANGUAGES]
        ],
    )

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return dataset metadata with features matching the selected schema."""
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "client_id": datasets.Value("string"),
                    "path": datasets.Value("string"),
                    "audio": datasets.features.Audio(sampling_rate=48_000),
                    "sentence": datasets.Value("string"),
                    "up_votes": datasets.Value("int64"),
                    "down_votes": datasets.Value("int64"),
                    "age": datasets.Value("string"),
                    "gender": datasets.Value("string"),
                    "accent": datasets.Value("string"),
                    "locale": datasets.Value("string"),
                    "segment": datasets.Value("string"),
                }
            )
        elif self.config.schema == "seacrowd_sptext":
            features = schemas.speech_text_features
        else:
            # Previously an unknown schema fell through to an UnboundLocalError.
            raise ValueError(f"Unknown schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download audio shards and transcripts, one SplitGenerator per split.

        The aggregate config's subset_id ends in "120", which selects every
        language in ``_LANGUAGES``; otherwise only the requested language is
        downloaded.
        """
        lang_code = self.config.subset_id.split("_")[-1]
        languages = [_LANG_TO_CVLANG.get(lang, lang) for lang in (_LANGUAGES if lang_code == "120" else [lang_code])]

        # n_shards.json tells us how many audio tar shards exist per lang/split.
        n_shards_path = dl_manager.download_and_extract(_URLS["n_shards"])
        with open(n_shards_path, encoding="utf-8") as f:
            n_shards = json.load(f)

        audio_urls = {}
        meta_urls = {}
        splits = ("train", "dev", "test")
        for split in splits:
            audio_urls[split] = [_URLS["audio"].format(lang=lang, split=split, shard_idx=i) for lang in languages for i in range(n_shards[lang][split])]
            meta_urls[split] = [_URLS["transcript"].format(lang=lang, split=split) for lang in languages]
        archive_paths = dl_manager.download(audio_urls)
        local_extracted_archive_paths = dl_manager.extract(archive_paths)
        meta_paths = dl_manager.download_and_extract(meta_urls)

        split_names = {
            "train": datasets.Split.TRAIN,
            "dev": datasets.Split.VALIDATION,
            "test": datasets.Split.TEST,
        }
        return [
            datasets.SplitGenerator(
                name=split_names.get(split, split),
                gen_kwargs={
                    "local_extracted_archive_paths": local_extracted_archive_paths.get(split),
                    "audio_archives": [dl_manager.iter_archive(path) for path in archive_paths.get(split)],
                    "meta_paths": meta_paths[split],
                    # BUG FIX: was hardcoded to "train" for every split.
                    "split": split,
                },
            )
            for split in splits
        ]

    @staticmethod
    def _read_metadata(meta_paths: List[Path], data_fields: List[str]) -> Dict[str, Dict]:
        """Read per-language transcript TSVs into a dict keyed by mp3 filename.

        Rows are normalized so every feature in ``data_fields`` is present
        (missing columns default to ""), and the Common Voice >= 11 "accents"
        column is mapped back to the schema's "accent" field.
        """
        metadata = {}
        for meta_path in meta_paths:
            with open(meta_path, encoding="utf-8") as f:
                reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
                for row in reader:
                    if not row["path"].endswith(".mp3"):
                        row["path"] += ".mp3"
                    if "accents" in row:
                        row["accent"] = row.pop("accents")
                    for field in data_fields:
                        row.setdefault(field, "")
                    metadata[row["path"]] = row
        return metadata

    def _generate_examples(self, local_extracted_archive_paths: List[Path], audio_archives: List[Path], meta_paths: List[Path], split: str) -> Tuple[int, Dict]:
        """Yield (key, example) pairs for one split.

        Args:
            local_extracted_archive_paths: extraction dirs, parallel to
                ``audio_archives``.
            audio_archives: iterators over the members of each audio tar.
            meta_paths: transcript TSV paths for this split.
            split: split name (unused; kept for gen_kwargs compatibility).
        """
        data_fields = list(self._info().features.keys())
        metadata = self._read_metadata(meta_paths, data_fields)

        for i, audio_archive in enumerate(audio_archives):
            for path, file in audio_archive:
                _, filename = os.path.split(path)
                if filename not in metadata:
                    # Audio without a transcript row (e.g. not validated) is skipped.
                    continue
                src_result = dict(metadata[filename])
                # Re-point the path at the locally extracted file.
                path = os.path.join(local_extracted_archive_paths[i], path)
                audio = {"path": path, "bytes": file.read()}

                if self.config.schema == "source":
                    yield path, {
                        "client_id": src_result["client_id"],
                        "path": path,
                        "audio": audio,
                        "sentence": src_result["sentence"],
                        "up_votes": src_result["up_votes"],
                        "down_votes": src_result["down_votes"],
                        "age": src_result["age"],
                        "gender": src_result["gender"],
                        "accent": src_result["accent"],
                        "locale": src_result["locale"],
                        "segment": src_result["segment"],
                    }
                elif self.config.schema == "seacrowd_sptext":
                    yield path, {
                        "id": src_result["path"].replace(".mp3", ""),
                        "path": path,
                        "audio": audio,
                        "text": src_result["sentence"],
                        "speaker_id": src_result["client_id"],
                        "metadata": {
                            # .get(): tolerate age buckets absent from
                            # _AGE_TO_INT instead of raising KeyError.
                            "speaker_age": _AGE_TO_INT.get(src_result["age"]),
                            "speaker_gender": src_result["gender"],
                        },
                    }
0 commit comments