Skip to content

Commit 9284a9c

Browse files
authored
Merge branch 'lhotse-speech:master' into gigaspeech2
2 parents 03b6d17 + 41269ff commit 9284a9c

File tree

9 files changed

+871
-0
lines changed

9 files changed

+871
-0
lines changed

docs/corpus.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ a CLI tool that create the manifests given a corpus directory.
109109
- :func:`lhotse.recipes.prepare_fisher_english`
110110
* - Fisher Spanish
111111
- :func:`lhotse.recipes.prepare_fisher_spanish`
112+
* - FLEURS
113+
- :func:`lhotse.recipes.prepare_fleurs`
112114
* - Fluent Speech Commands
113115
- :func:`lhotse.recipes.slu`
114116
* - GALE Arabic Broadcast Speech
@@ -211,6 +213,8 @@ a CLI tool that create the manifests given a corpus directory.
211213
- :func:`lhotse.recipes.prepare_wenetspeech4tts`
212214
* - YesNo
213215
- :func:`lhotse.recipes.prepare_yesno`
216+
* - Emilia
217+
- :func:`lhotse.recipes.prepare_emilia`
214218
* - Eval2000
215219
- :func:`lhotse.recipes.prepare_eval2000`
216220
* - MGB2

lhotse/bin/modes/recipes/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,11 @@
3131
from .earnings22 import *
3232
from .ears import *
3333
from .edacc import *
34+
from .emilia import *
3435
from .eval2000 import *
3536
from .fisher_english import *
3637
from .fisher_spanish import *
38+
from .fleurs import *
3739
from .gale_arabic import *
3840
from .gale_mandarin import *
3941
from .gigaspeech import *
@@ -66,6 +68,7 @@
6668
from .nsc import *
6769
from .peoples_speech import *
6870
from .primewords import *
71+
from .radio import *
6972
from .reazonspeech import *
7073
from .rir_noise import *
7174
from .sbcsae import *

lhotse/bin/modes/recipes/emilia.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import click
2+
3+
from lhotse.bin.modes import prepare
4+
from lhotse.recipes.emilia import prepare_emilia
5+
from lhotse.utils import Pathlike
6+
7+
8+
@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-l",
    "--lang",
    # Validate at the CLI boundary: a missing or misspelled language becomes a
    # click usage error instead of a ValueError traceback from the recipe.
    # Matching is case-insensitive because the recipe upper-cases the value.
    type=click.Choice(["zh", "en", "ja", "ko", "de", "fr"], case_sensitive=False),
    required=True,
    help="The language to process. Valid values: zh, en, ja, ko, de, fr",
)
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
def emilia(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    lang: str,
    num_jobs: int = 1,
):
    """Prepare the Emilia corpus manifests for a single language."""
    # Thin CLI wrapper: every argument is forwarded unchanged to the recipe.
    prepare_emilia(
        corpus_dir=corpus_dir,
        output_dir=output_dir,
        lang=lang,
        num_jobs=num_jobs,
    )

lhotse/bin/modes/recipes/fleurs.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from typing import Optional, Sequence, Union
2+
3+
import click
4+
5+
from lhotse.bin.modes import download, prepare
6+
from lhotse.recipes.fleurs import download_fleurs, prepare_fleurs
7+
from lhotse.utils import Pathlike
8+
9+
__all__ = ["fleurs"]
10+
11+
12+
@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
@click.option(
    "-l",
    "--lang",
    multiple=True,
    default=["all"],
    # The example previously referred to a different recipe and misspelled the
    # program name; it now shows a valid invocation of this command.
    help="Specify which languages to prepare, e.g., "
    "lhotse prepare fleurs corpus_dir output_dir -l hi_in -l en_us",
)
def fleurs(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    num_jobs: int,
    lang: Optional[Union[str, Sequence[str]]],
):
    """FLEURS ASR data preparation."""
    # With multiple=True click collects every -l occurrence into one sequence,
    # which is passed to the recipe as ``languages``.
    prepare_fleurs(
        corpus_dir,
        output_dir=output_dir,
        num_jobs=num_jobs,
        languages=lang,
    )
38+
39+
40+
@download.command(context_settings=dict(show_default=True))
@click.argument("target_dir", type=click.Path())
@click.option(
    "-l",
    "--lang",
    multiple=True,
    default=["all"],
    # The old text ran two examples together, and the second one omitted the
    # required TARGET_DIR argument, so it could not be run as written.
    help="Specify which languages to download, e.g., "
    "'lhotse download fleurs . -l hi_in -l en_us'. "
    "Omit -l (e.g., 'lhotse download fleurs .') to use the default.",
)
@click.option(
    "--force-download",
    type=bool,
    is_flag=True,
    default=False,
    help="Specify whether to overwrite an existing archive",
)
def fleurs(
    target_dir: Pathlike,
    lang: Optional[Union[str, Sequence[str]]],
    force_download: bool = False,
):
    """FLEURS download."""
    # With multiple=True click collects every -l occurrence into one sequence,
    # which is passed to the recipe as ``languages``.
    download_fleurs(
        target_dir,
        languages=lang,
        force_download=force_download,
    )

lhotse/bin/modes/recipes/radio.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from typing import List, Optional, Sequence, Tuple, Union
2+
3+
import click
4+
5+
from lhotse.bin.modes import prepare
6+
from lhotse.recipes.radio import prepare_radio
7+
from lhotse.utils import Pathlike
8+
9+
__all__ = ["radio"]
10+
11+
12+
@prepare.command(context_settings={"show_default": True})
@click.argument("corpus_dir", type=click.Path(dir_okay=True))
@click.argument("output_dir", type=click.Path(dir_okay=True))
@click.option(
    "-d",
    "--min-seg-dur",
    default=0.5,
    type=float,
    help="The minimum segment duration",
)
@click.option(
    "-j",
    "--num-jobs",
    default=4,
    type=int,
    help="The number of parallel threads to use for data preparation",
)
def radio(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    min_seg_dur: float = 0.5,
    num_jobs: int = 4,
):
    """Data preparation"""
    # Thin CLI wrapper around the recipe. The CLI option name (min_seg_dur) is
    # mapped onto the recipe's keyword (min_segment_duration); everything else
    # is forwarded unchanged.
    recipe_options = {
        "output_dir": output_dir,
        "num_jobs": num_jobs,
        "min_segment_duration": min_seg_dur,
    }
    prepare_radio(corpus_dir, **recipe_options)

lhotse/recipes/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from .eval2000 import prepare_eval2000
3232
from .fisher_english import prepare_fisher_english
3333
from .fisher_spanish import prepare_fisher_spanish
34+
from .fleurs import download_fleurs, prepare_fleurs
3435
from .gale_arabic import prepare_gale_arabic
3536
from .gale_mandarin import prepare_gale_mandarin
3637
from .gigaspeech import prepare_gigaspeech
@@ -67,6 +68,7 @@
6768
from .musan import download_musan, prepare_musan
6869
from .nsc import prepare_nsc
6970
from .peoples_speech import prepare_peoples_speech
71+
from .radio import prepare_radio
7072
from .reazonspeech import download_reazonspeech, prepare_reazonspeech
7173
from .rir_noise import download_rir_noise, prepare_rir_noise
7274
from .sbcsae import download_sbcsae, prepare_sbcsae
@@ -146,6 +148,8 @@
146148
"prepare_eval2000",
147149
"prepare_fisher_english",
148150
"prepare_fisher_spanish",
151+
"download_fleurs",
152+
"prepare_fleurs",
149153
"prepare_gale_arabic",
150154
"prepare_gale_mandarin",
151155
"prepare_gigaspeech",
@@ -196,6 +200,7 @@
196200
"prepare_peoples_speech",
197201
"download_reazonspeech",
198202
"prepare_reazonspeech",
203+
"prepare_radio",
199204
"download_rir_noise",
200205
"prepare_rir_noise",
201206
"prepare_slu",

lhotse/recipes/emilia.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
"""
2+
The Emilia dataset is constructed from a vast collection of speech data sourced
3+
from diverse video platforms and podcasts on the Internet, covering various
4+
content genres such as talk shows, interviews, debates, sports commentary, and
5+
audiobooks. This variety ensures the dataset captures a wide array of real
6+
human speaking styles. The initial version of the Emilia dataset includes a
7+
total of 101,654 hours of multilingual speech data in six different languages:
8+
English, French, German, Chinese, Japanese, and Korean.
9+
10+
See also
11+
https://emilia-dataset.github.io/Emilia-Demo-Page/
12+
13+
Please note that Emilia does not own the copyright to the audio files; the
14+
copyright remains with the original owners of the videos or audio. Users are
15+
permitted to use this dataset only for non-commercial purposes under the
16+
CC BY-NC-4.0 license.
17+
18+
Please refer to
19+
https://huggingface.co/datasets/amphion/Emilia-Dataset
20+
or
21+
https://openxlab.org.cn/datasets/Amphion/Emilia
22+
to download the dataset.
23+
24+
Note that you need to apply for access before you can download the dataset.
25+
26+
"""
27+
28+
from concurrent.futures.thread import ThreadPoolExecutor
29+
from pathlib import Path
30+
from typing import Optional, Tuple
31+
32+
from tqdm.auto import tqdm
33+
34+
from lhotse import CutSet, MonoCut
35+
from lhotse.audio import Recording
36+
from lhotse.serialization import load_jsonl
37+
from lhotse.supervision import SupervisionSegment
38+
from lhotse.utils import Pathlike
39+
40+
41+
def _parse_utterance(
    data_dir: Path,
    line: dict,
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """
    Build the recording and its supervision for one entry of an Emilia JSONL file.

    :param data_dir: directory against which the entry's ``wav`` path is resolved.
    :param line: one decoded JSONL entry, which looks like below::

            {
              "id": "DE_B00000_S00000_W000029",
              "wav": "DE_B00000/DE_B00000_S00000/mp3/DE_B00000_S00000_W000029.mp3",
              "text": " Und es gibt auch einen Stadtplan von Tegun zu sehen.",
              "duration": 3.228,
              "speaker": "DE_B00000_S00000",
              "language": "de",
              "dnsmos": 3.3697
            }

    :return: a ``(recording, supervision)`` tuple, or ``None`` when the file
        referenced by ``wav`` does not exist under ``data_dir``.
    """
    audio_path = data_dir / line["wav"]

    if audio_path.is_file():
        # The file stem doubles as the recording id and the supervision id.
        rec = Recording.from_file(path=audio_path, recording_id=audio_path.stem)
        # A single supervision spans the whole recording; its duration comes
        # from the audio file rather than from the entry's "duration" field.
        sup = SupervisionSegment(
            id=rec.id,
            recording_id=rec.id,
            start=0.0,
            duration=rec.duration,
            channel=0,
            text=line["text"],
            language=line["language"],
            speaker=line["speaker"],
            custom={"dnsmos": line["dnsmos"]},
        )
        return rec, sup

    return None
83+
84+
85+
def prepare_emilia(
    corpus_dir: Pathlike,
    lang: str,
    num_jobs: int,
    output_dir: Optional[Pathlike] = None,
) -> CutSet:
    """
    Prepare a CutSet for one language of the Emilia corpus.

    Every ``*.jsonl`` file found directly under ``corpus_dir/raw/<LANG>`` is
    read, and each entry whose audio file exists becomes one cut that spans
    the whole recording and carries a single supervision. Entries whose audio
    file is missing are skipped, and their number is reported in a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
        We assume the directory has the following structure:
        corpus_dir/raw/openemilia_all.tar.gz,
        corpus_dir/raw/DE,
        corpus_dir/raw/DE/DE_B00000.jsonl,
        corpus_dir/raw/DE/DE_B00000/DE_B00000_S00000/mp3/DE_B00000_S00000_W000000.mp3,
        corpus_dir/raw/EN, etc.
    :param lang: str, one of en, zh, de, ko, ja, fr (case-insensitive).
    :param num_jobs: int, number of threads for processing jsonl files
    :param output_dir: Pathlike, the path where to write the manifests.
        When given, the directory is created if needed and the cuts are
        written to ``emilia_cuts_<LANG>.jsonl.gz`` inside it.
    :return: The CutSet containing the data for the given language.
    :raises ValueError: if ``lang`` is ``None`` or not a supported language.
    :raises AssertionError: if ``corpus_dir`` or ``corpus_dir/raw/<LANG>`` is
        not a directory.
    """
    import logging

    if lang is None:
        raise ValueError("Please provide --lang")

    lang_uppercase = lang.upper()
    if lang_uppercase not in ("DE", "EN", "FR", "JA", "KO", "ZH"):
        raise ValueError(
            "Please provide a valid language. "
            f"Choose from de, en, fr, ja, ko, zh. Given: {lang}"
        )

    # NOTE(review): asserts are stripped under ``python -O``; they are kept so
    # the exception type seen by existing callers does not change.
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    data_dir = corpus_dir / "raw" / lang_uppercase
    assert data_dir.is_dir(), f"No such directory: {data_dir}"

    # Sorted so the order of cuts in the manifest does not depend on the
    # order in which the filesystem happens to enumerate the files.
    jsonl_files = sorted(data_dir.glob("*.jsonl"))

    cuts = []
    futures = []
    num_missing = 0

    with ThreadPoolExecutor(num_jobs) as ex:
        for jsonl_file in jsonl_files:
            for item in tqdm(
                load_jsonl(jsonl_file),
                desc=f"Processing {jsonl_file} with {num_jobs} jobs",
            ):
                futures.append(ex.submit(_parse_utterance, data_dir, item))

        # Results are collected in submission order, so the cut order follows
        # the order of the entries in the JSONL files.
        for future in tqdm(futures, desc="Collecting futures"):
            result = future.result()
            if result is None:
                # The audio file referenced by the entry does not exist.
                num_missing += 1
                continue

            recording, segment = result

            cuts.append(
                MonoCut(
                    id=recording.id,
                    recording=recording,
                    start=0,
                    duration=recording.duration,
                    supervisions=[segment],
                    channel=0,
                )
            )

    if num_missing:
        logging.warning(
            f"Skipped {num_missing} utterance(s) whose audio file "
            f"was not found under {data_dir}"
        )

    cut_set = CutSet.from_cuts(cuts)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the destination exists before writing the manifest.
        output_dir.mkdir(parents=True, exist_ok=True)
        cut_set.to_file(output_dir / f"emilia_cuts_{lang_uppercase}.jsonl.gz")

    return cut_set

0 commit comments

Comments
 (0)