Merge branch 'lhotse-speech:master' into gigaspeech2

yfyeung · web-flow · commit 9284a9c2143c · 2024-10-22T10:42:47.000+08:00
diff --git a/docs/corpus.rst b/docs/corpus.rst
@@ -109,6 +109,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_fisher_english`
   * - Fisher Spanish
     - :func:`lhotse.recipes.prepare_fisher_spanish`
+  * - FLEURS
+    - :func:`lhotse.recipes.prepare_fleurs`
   * - Fluent Speech Commands
     - :func:`lhotse.recipes.slu`
   * - GALE Arabic Broadcast Speech
@@ -211,6 +213,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_wenetspeech4tts`
   * - YesNo
     - :func:`lhotse.recipes.prepare_yesno`
+  * - Emilia
+    - :func:`lhotse.recipes.prepare_emilia`
   * - Eval2000
     - :func:`lhotse.recipes.prepare_eval2000`
   * - MGB2
diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
@@ -31,9 +31,11 @@
 from .earnings22 import *
 from .ears import *
 from .edacc import *
+from .emilia import *
 from .eval2000 import *
 from .fisher_english import *
 from .fisher_spanish import *
+from .fleurs import *
 from .gale_arabic import *
 from .gale_mandarin import *
 from .gigaspeech import *
@@ -66,6 +68,7 @@
 from .nsc import *
 from .peoples_speech import *
 from .primewords import *
+from .radio import *
 from .reazonspeech import *
 from .rir_noise import *
 from .sbcsae import *
diff --git a/lhotse/bin/modes/recipes/emilia.py b/lhotse/bin/modes/recipes/emilia.py
@@ -0,0 +1,36 @@
+import click
+
+from lhotse.bin.modes import prepare
+from lhotse.recipes.emilia import prepare_emilia
+from lhotse.utils import Pathlike
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "-l",
+    "--lang",
+    type=str,
+    required=True,
+    help="The language to process. Valid values: zh, en, ja, ko, de, fr",
+)
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many threads to use (can give good speed-ups with slow disks).",
+)
+def emilia(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    lang: str,
+    num_jobs: int = 1,
+):
+    """Prepare the Emilia corpus manifests."""
+    # --lang is marked required, so a missing value is reported by click as a
+    # usage error before the recipe is invoked.
+    prepare_emilia(
+        corpus_dir=corpus_dir, output_dir=output_dir, lang=lang, num_jobs=num_jobs
+    )
diff --git a/lhotse/bin/modes/recipes/fleurs.py b/lhotse/bin/modes/recipes/fleurs.py
@@ -0,0 +1,68 @@
+from typing import Optional, Sequence, Union
+
+import click
+
+from lhotse.bin.modes import download, prepare
+from lhotse.recipes.fleurs import download_fleurs, prepare_fleurs
+from lhotse.utils import Pathlike
+
+__all__ = ["fleurs"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many threads to use (can give good speed-ups with slow disks).",
+)
+@click.option(
+    "-l",
+    "--lang",
+    multiple=True,  # -l may be repeated; click collects the values into a tuple
+    default=["all"],
+    help="Specify which languages to prepare, e.g., "
+    "        lhotse prepare fleurs corpus_dir output_dir -l hi_in -l en_us ",
+)
+def fleurs(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    num_jobs: int,
+    lang: Optional[Union[str, Sequence[str]]],
+):
+    """Fleurs ASR data preparation."""
+    prepare_fleurs(corpus_dir, output_dir=output_dir, num_jobs=num_jobs, languages=lang)
+
+
+@download.command(context_settings=dict(show_default=True))
+@click.argument("target_dir", type=click.Path())
+@click.option(
+    "-l",
+    "--lang",
+    multiple=True,  # -l may be repeated; click collects the values into a tuple
+    default=["all"],
+    help="Specify which languages to download, e.g., "
+    "        lhotse download fleurs . -l hi_in -l en_us "
+    "        lhotse download fleurs .",
+)
+@click.option(
+    "--force-download",
+    type=bool,
+    is_flag=True,
+    default=False,
+    help="Specify whether to overwrite an existing archive",
+)
+def fleurs(
+    target_dir: Pathlike,
+    lang: Optional[Union[str, Sequence[str]]],
+    force_download: bool = False,
+):
+    """FLEURS download."""
+    download_fleurs(
+        target_dir,
+        languages=lang,  # forwarded as-is; the default selection is ("all",)
+        force_download=force_download,
+    )
diff --git a/lhotse/bin/modes/recipes/radio.py b/lhotse/bin/modes/recipes/radio.py
@@ -0,0 +1,41 @@
+from typing import List, Optional, Sequence, Tuple, Union
+
+import click
+
+from lhotse.bin.modes import prepare
+from lhotse.recipes.radio import prepare_radio
+from lhotse.utils import Pathlike
+
+__all__ = ["radio"]
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(dir_okay=True))
+@click.argument("output_dir", type=click.Path(dir_okay=True))
+@click.option(
+    "-d",
+    "--min-seg-dur",
+    type=float,
+    default=0.5,  # NOTE(review): unit is presumably seconds - confirm in recipe
+    help="The minimum segment duration",
+)
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=4,
+    help="The number of parallel threads to use for data preparation",
+)
+def radio(
+    corpus_dir: Pathlike,
+    output_dir: Pathlike,
+    min_seg_dur: float = 0.5,
+    num_jobs: int = 4,
+):
+    """Data preparation"""
+    prepare_radio(
+        corpus_dir,
+        output_dir=output_dir,
+        num_jobs=num_jobs,
+        min_segment_duration=min_seg_dur,  # --min-seg-dur maps to this keyword
+    )
diff --git a/lhotse/recipes/__init__.py b/lhotse/recipes/__init__.py
@@ -31,6 +31,7 @@
 from .eval2000 import prepare_eval2000
 from .fisher_english import prepare_fisher_english
 from .fisher_spanish import prepare_fisher_spanish
+from .fleurs import download_fleurs, prepare_fleurs
 from .gale_arabic import prepare_gale_arabic
 from .gale_mandarin import prepare_gale_mandarin
 from .gigaspeech import prepare_gigaspeech
@@ -67,6 +68,7 @@
 from .musan import download_musan, prepare_musan
 from .nsc import prepare_nsc
 from .peoples_speech import prepare_peoples_speech
+from .radio import prepare_radio
 from .reazonspeech import download_reazonspeech, prepare_reazonspeech
 from .rir_noise import download_rir_noise, prepare_rir_noise
 from .sbcsae import download_sbcsae, prepare_sbcsae
@@ -146,6 +148,8 @@
     "prepare_eval2000",
     "prepare_fisher_english",
     "prepare_fisher_spanish",
+    "download_fleurs",
+    "prepare_fleurs",
     "prepare_gale_arabic",
     "prepare_gale_mandarin",
     "prepare_gigaspeech",
@@ -196,6 +200,7 @@
     "prepare_peoples_speech",
     "download_reazonspeech",
     "prepare_reazonspeech",
+    "prepare_radio",
     "download_rir_noise",
     "prepare_rir_noise",
     "prepare_slu",
diff --git a/lhotse/recipes/emilia.py b/lhotse/recipes/emilia.py
@@ -0,0 +1,165 @@
+"""
+The Emilia dataset is constructed from a vast collection of speech data sourced
+from diverse video platforms and podcasts on the Internet, covering various
+content genres such as talk shows, interviews, debates, sports commentary, and
+audiobooks. This variety ensures the dataset captures a wide array of real
+human speaking styles. The initial version of the Emilia dataset includes a
+total of 101,654 hours of multilingual speech data in six different languages:
+English, French, German, Chinese, Japanese, and Korean.
+
+See also
+https://emilia-dataset.github.io/Emilia-Demo-Page/
+
+Please note that Emilia does not own the copyright to the audio files; the
+copyright remains with the original owners of the videos or audio. Users are
+permitted to use this dataset only for non-commercial purposes under the
+CC BY-NC-4.0 license.
+
+Please refer to
+https://huggingface.co/datasets/amphion/Emilia-Dataset
+or
+https://openxlab.org.cn/datasets/Amphion/Emilia
+to download the dataset.
+
+Note that you need to apply for downloading.
+
+"""
+
+from concurrent.futures.thread import ThreadPoolExecutor
+from pathlib import Path
+from typing import Optional, Tuple
+
+from tqdm.auto import tqdm
+
+from lhotse import CutSet, MonoCut
+from lhotse.audio import Recording
+from lhotse.serialization import load_jsonl
+from lhotse.supervision import SupervisionSegment
+from lhotse.utils import Pathlike
+
+
+def _parse_utterance(
+    data_dir: Path,
+    line: dict,
+) -> Optional[Tuple[Recording, SupervisionSegment]]:
+    """
+    Build the recording and supervision for a single Emilia metadata entry.
+
+    :param data_dir: directory that the entry's relative "wav" path is joined to
+    :param line: one decoded JSONL entry, for example::
+
+        {
+          "id": "DE_B00000_S00000_W000029",
+          "wav": "DE_B00000/DE_B00000_S00000/mp3/DE_B00000_S00000_W000029.mp3",
+          "text": " Und es gibt auch einen Stadtplan von Tegun zu sehen.",
+          "duration": 3.228,
+          "speaker": "DE_B00000_S00000",
+          "language": "de",
+          "dnsmos": 3.3697
+        }
+
+    :return: ``(recording, supervision)``, or ``None`` if the audio file is missing
+    """
+    audio_path = data_dir / line["wav"]
+    if not audio_path.is_file():
+        return None
+
+    # The file stem is passed as the recording id and reused for the supervision.
+    utt_id = audio_path.stem
+    recording = Recording.from_file(path=audio_path, recording_id=utt_id)
+    supervision = SupervisionSegment(
+        id=recording.id,
+        recording_id=recording.id,
+        start=0.0,
+        duration=recording.duration,
+        channel=0,
+        text=line["text"],
+        language=line["language"],
+        speaker=line["speaker"],
+        custom={"dnsmos": line["dnsmos"]},
+    )
+
+    return recording, supervision
+
+
+def prepare_emilia(
+    corpus_dir: Pathlike,
+    lang: str,
+    num_jobs: int,
+    output_dir: Optional[Pathlike] = None,
+) -> CutSet:
+    """
+    Prepare the cuts (recording + supervision) for one language of Emilia.
+    Entries whose audio file does not exist on disk are silently skipped.
+
+    :param corpus_dir: Pathlike, the path of the data dir.
+                       We assume the directory has the following structure:
+                       corpus_dir/raw/openemilia_all.tar.gz,
+                       corpus_dir/raw/DE,
+                       corpus_dir/raw/DE/DE_B00000.jsonl,
+                       corpus_dir/raw/DE/DE_B00000/DE_B00000_S00000/mp3/DE_B00000_S00000_W000000.mp3,
+                       corpus_dir/raw/EN, etc.
+    :param lang: str, one of en, zh, de, ko, ja, fr (case-insensitive)
+    :param num_jobs: int, number of threads used to parse the utterances
+    :param output_dir: Pathlike, the path where to write the manifests.
+                       Created if missing; nothing is written when None.
+    :return: The CutSet containing the data for the given language.
+    :raises ValueError: if ``lang`` is None or not a supported language.
+    """
+    if lang is None:
+        raise ValueError("Please provide --lang")
+
+    lang_uppercase = lang.upper()
+    if lang_uppercase not in ("DE", "EN", "FR", "JA", "KO", "ZH"):
+        raise ValueError(
+            "Please provide a valid language. "
+            f"Choose from de, en, fr, ja, ko, zh. Given: {lang}"
+        )
+
+    corpus_dir = Path(corpus_dir)
+    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
+    data_dir = corpus_dir / "raw" / lang_uppercase
+    assert data_dir.is_dir(), f"No such directory: {data_dir}"
+
+    # Sort so the cut order does not depend on the filesystem's listing order.
+    jsonl_files = sorted(data_dir.glob("*.jsonl"))
+
+    cuts = []
+    futures = []
+
+    with ThreadPoolExecutor(num_jobs) as ex:
+        for jsonl_file in jsonl_files:
+            for item in tqdm(
+                load_jsonl(jsonl_file),
+                desc=f"Processing {jsonl_file} with {num_jobs} jobs",
+            ):
+                futures.append(ex.submit(_parse_utterance, data_dir, item))
+
+        for future in tqdm(futures, desc="Collecting futures"):
+            result = future.result()
+            if result is None:
+                # The audio file listed in the metadata is missing; skip it.
+                continue
+
+            recording, segment = result
+
+            cuts.append(
+                MonoCut(
+                    id=recording.id,
+                    recording=recording,
+                    start=0,
+                    duration=recording.duration,
+                    supervisions=[segment],
+                    channel=0,
+                )
+            )
+
+    cut_set = CutSet.from_cuts(cuts)
+
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        # Create the directory first so writing the manifest cannot fail on it.
+        output_dir.mkdir(parents=True, exist_ok=True)
+        cut_set.to_file(output_dir / f"emilia_cuts_{lang_uppercase}.jsonl.gz")
+
+    return cut_set
diff --git a/lhotse/recipes/fleurs.py b/lhotse/recipes/fleurs.py
diff --git a/lhotse/recipes/radio.py b/lhotse/recipes/radio.py