Gigaspeech2 #3
Merged: 14 commits (Dec 16, 2024)
2 changes: 2 additions & 0 deletions docs/corpus.rst
@@ -119,6 +119,8 @@ a CLI tool that creates the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_gale_mandarin`
* - GigaSpeech
- :func:`lhotse.recipes.prepare_gigaspeech`
* - GigaSpeech 2
- :func:`lhotse.recipes.prepare_gigaspeech2`
* - GigaST
- :func:`lhotse.recipes.prepare_gigast`
* - Heroico
8 changes: 4 additions & 4 deletions docs/datasets.rst
@@ -28,7 +28,7 @@ It allows for interesting collation methods - e.g. **padding the speech with noi

The items for mini-batch creation are selected by the ``Sampler``.
Lhotse defines ``Sampler`` classes that are initialized with :class:`~lhotse.cut.CutSet`'s, so that they can look up specific properties of an utterance to stratify the sampling.
For example, :class:`~lhotse.dataset.sampling.SimpleCutSampler` has a defined ``max_frames`` attribute, and it will keep sampling cuts for a batch until they do not exceed the specified number of frames.
For example, :class:`~lhotse.dataset.sampling.SimpleCutSampler` has a defined ``max_duration`` attribute, and it will keep sampling cuts for a batch until they do not exceed the specified number of seconds.
Another strategy — used in :class:`~lhotse.dataset.sampling.BucketingSampler` — will first group the cuts of similar durations into buckets, and then randomly select a bucket to draw the whole batch from.

For tasks where both input and output of the model are speech utterances, we can use the :class:`~lhotse.dataset.sampling.CutPairsSampler`, which accepts two :class:`~lhotse.cut.CutSet`'s and will match the cuts in them by their IDs.
@@ -38,11 +38,11 @@ A typical Lhotse dataset API usage might look like this:
.. code-block::

from torch.utils.data import DataLoader
from lhotse.dataset import SpeechRecognitionDataset, SimpleCutSampler
from lhotse.dataset import K2SpeechRecognitionDataset, SimpleCutSampler

cuts = CutSet(...)
dset = SpeechRecognitionDataset(cuts)
sampler = SimpleCutSampler(cuts, max_frames=50000)
dset = K2SpeechRecognitionDataset(cuts)
sampler = SimpleCutSampler(cuts, max_duration=500)
# Dataset performs batching by itself, so we have to indicate that
# to the DataLoader with batch_size=None
dloader = DataLoader(dset, sampler=sampler, batch_size=None, num_workers=1)
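The snippet above relies on dynamic batching: the sampler keeps drawing cuts until their total duration would exceed ``max_duration``. A minimal stdlib-only sketch of that rule (the ``Cut`` dataclass and ``sample_batch`` helper are illustrative, not lhotse's API):

```python
from dataclasses import dataclass
from typing import Iterator, List


@dataclass
class Cut:
    id: str
    duration: float  # seconds


def sample_batch(cuts: Iterator[Cut], max_duration: float) -> List[Cut]:
    """Greedily fill a batch until adding one more cut would exceed max_duration."""
    batch, total = [], 0.0
    for cut in cuts:
        if batch and total + cut.duration > max_duration:
            break  # adding this cut would exceed the duration budget
        batch.append(cut)
        total += cut.duration
    return batch


cuts = iter([Cut("a", 4.0), Cut("b", 3.0), Cut("c", 5.0)])
batch = sample_batch(cuts, max_duration=8.0)
# batch holds "a" and "b" (7.0 s total); "c" would push the total to 12.0 s
```

The real samplers add bookkeeping (epochs, shuffling, world-size sharding) on top of this core loop.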
17 changes: 17 additions & 0 deletions lhotse/audio/recording.py
@@ -168,6 +168,23 @@ def is_placeholder(self) -> bool:
def num_channels(self) -> int:
return len(self.channel_ids)

@property
def source_format(self) -> str:
"""Infer format of the audio sources.
If all sources have the same format, return it.
If sources have different formats, raise an error.
"""
source_formats = list(set([s.format for s in self.sources]))

if len(source_formats) == 1:
# if all sources have the same format, return it
return source_formats[0]
else:
# at the moment, we don't resolve different formats
raise NotImplementedError(
"Sources have different formats. Resolving to a single format not implemented."
)

@staticmethod
def from_file(
path: Pathlike,
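The new ``source_format`` property reduces the per-source formats to a single consensus value and refuses mixed recordings. A standalone sketch of that rule (``consensus_format`` is an illustrative helper, not part of lhotse):

```python
from typing import List


def consensus_format(formats: List[str]) -> str:
    """Return the single format shared by all sources, or fail on a mix."""
    unique = sorted(set(formats))
    if len(unique) == 1:
        # all sources agree, so the recording has one well-defined format
        return unique[0]
    raise NotImplementedError(
        "Sources have different formats. Resolving to a single format not implemented."
    )


assert consensus_format(["wav", "wav", "wav"]) == "wav"
```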
28 changes: 28 additions & 0 deletions lhotse/audio/source.py
@@ -1,3 +1,5 @@
import io
import os
import warnings
from dataclasses import dataclass
from io import BytesIO, FileIO
@@ -6,6 +8,7 @@
from typing import List, Optional, Tuple, Union

import numpy as np
import soundfile as sf
import torch

from lhotse.audio.backend import read_audio
@@ -64,6 +67,10 @@ class AudioSource:
def has_video(self) -> bool:
return self.video is not None

@property
def format(self) -> str:
return self._get_format()

def load_audio(
self,
offset: Seconds = 0.0,
@@ -316,3 +323,24 @@ def _prepare_for_reading(
)

return source

def _get_format(self) -> str:
"""Get format for the audio source.
If using 'file' or 'url' types, the format is inferred from the file extension, as in soundfile.
If using 'memory' type, the format is inferred from the binary data.
"""
if self.type in ("file", "url"):
# Resolve audio format based on the filename
format = os.path.splitext(self.source)[-1][1:]
return format.lower()
elif self.type == "memory":
sf_info = sf.info(io.BytesIO(self.source))
if sf_info.format == "OGG" and sf_info.subtype == "OPUS":
# soundfile describes opus as ogg container with opus coding
return "opus"
else:
return sf_info.format.lower()
else:
raise NotImplementedError(
f"Getting format not implemented for source type {self.type}"
)
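For ``'file'`` and ``'url'`` sources, ``_get_format`` simply lowercases the file extension. The ``'memory'`` branch (probing bytes with ``soundfile``) needs real encoded audio, so this stdlib sketch covers only the extension branch (``format_from_path`` is an illustrative name):

```python
import os


def format_from_path(path: str) -> str:
    """Infer an audio format from a path or URL extension, as the diff does."""
    # splitext returns ('.../utt1', '.WAV'); drop the dot and normalize case
    return os.path.splitext(path)[-1][1:].lower()


assert format_from_path("/data/audio/utt1.WAV") == "wav"
assert format_from_path("https://example.com/clip.opus") == "opus"
```

Note that a path with no extension yields an empty string, which callers would need to handle.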
1 change: 1 addition & 0 deletions lhotse/bin/modes/recipes/__init__.py
@@ -39,6 +39,7 @@
from .gale_arabic import *
from .gale_mandarin import *
from .gigaspeech import *
from .gigaspeech2 import *
from .gigast import *
from .grid import *
from .heroico import *
38 changes: 38 additions & 0 deletions lhotse/bin/modes/recipes/gigaspeech2.py
@@ -0,0 +1,38 @@
from typing import Optional, Sequence, Union

import click

from lhotse.bin.modes import prepare
from lhotse.recipes.gigaspeech2 import prepare_gigaspeech2
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
"-l",
"--languages",
default="auto",
help="Languages to prepare (scans CORPUS_DIR for language codes by default).",
)
@click.option(
"-j",
"--num-jobs",
type=int,
default=1,
help="How many threads to use (can give good speed-ups with slow disks).",
)
def gigaspeech2(
corpus_dir: Pathlike,
output_dir: Optional[Pathlike] = None,
languages: Union[str, Sequence[str]] = "auto",
num_jobs: int = 1,
):
"""GigaSpeech 2 data preparation."""
prepare_gigaspeech2(
corpus_dir=corpus_dir,
output_dir=output_dir,
languages=languages,
num_jobs=num_jobs,
)
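The ``--languages`` default is ``"auto"``, which the help text says scans ``CORPUS_DIR`` for language codes. One plausible way such a scan could work is to treat each top-level subdirectory name as a language code; this is an assumption for illustration, and the real ``prepare_gigaspeech2`` may resolve languages differently:

```python
import tempfile
from pathlib import Path
from typing import List


def scan_languages(corpus_dir: Path) -> List[str]:
    """Hypothetical 'auto' mode: each top-level subdirectory is a language code."""
    return sorted(p.name for p in corpus_dir.iterdir() if p.is_dir())


with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / "th").mkdir()
    (root / "id").mkdir()
    (root / "README.md").touch()  # plain files are ignored by the scan
    langs = scan_languages(root)
# langs == ["id", "th"]
```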
4 changes: 2 additions & 2 deletions lhotse/bin/modes/shar.py
@@ -27,8 +27,8 @@ def shar():
"-a",
"--audio",
default="none",
type=click.Choice(["none", "wav", "flac", "mp3", "opus"]),
help="Format in which to export audio (disabled by default, enabling will make a copy of the data)",
type=click.Choice(["none", "wav", "flac", "mp3", "opus", "original"]),
help="Format in which to export audio. 'original' saves in the same format as the source audio (disabled by default; enabling makes a copy of the data)",
)
@click.option(
"-f",
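With the new ``original`` choice, the exporter presumably falls back to each recording's own source format instead of transcoding to a fixed codec. A sketch of that resolution rule (``resolve_audio_format`` is an illustrative helper, not lhotse's actual function):

```python
def resolve_audio_format(choice: str, source_format: str) -> str:
    """Map the CLI --audio choice to the on-disk format for one recording."""
    if choice == "original":
        # keep whatever format the recording already uses (e.g. from
        # Recording.source_format), avoiding a lossy re-encode
        return source_format
    return choice  # a concrete codec was requested; transcode to it


assert resolve_audio_format("flac", source_format="opus") == "flac"
assert resolve_audio_format("original", source_format="opus") == "opus"
```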
2 changes: 1 addition & 1 deletion lhotse/cut/data.py
@@ -723,7 +723,7 @@ def pad(
"""
Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.

The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
or a specific number of samples `num_samples`. The three arguments are mutually exclusive.

:param duration: The cut's minimal duration after padding.
2 changes: 1 addition & 1 deletion lhotse/cut/mixed.py
@@ -622,7 +622,7 @@ def pad(
"""
Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.

The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
or a specific number of samples `num_samples`. The three arguments are mutually exclusive.

:param duration: The cut's minimal duration after padding.
2 changes: 1 addition & 1 deletion lhotse/cut/padding.py
@@ -236,7 +236,7 @@ def pad(
"""
Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.

The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
or a specific number of samples `num_samples`. The three arguments are mutually exclusive.

:param duration: The cut's minimal duration after padding.
2 changes: 1 addition & 1 deletion lhotse/cut/set.py
@@ -2821,7 +2821,7 @@ def pad(
"""
Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.

The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
or a specific number of samples `num_samples`. The three arguments are mutually exclusive.

:param cut: DataCut to be padded.
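The corrected ``pad`` docstrings say ``duration``, ``num_frames``, and ``num_samples`` are mutually exclusive. A sketch of that validation, assuming exactly one must be given (``check_pad_args`` is an illustrative helper, not lhotse's implementation):

```python
def check_pad_args(duration=None, num_frames=None, num_samples=None):
    """Require exactly one of the three mutually exclusive pad targets."""
    given = [a for a in (duration, num_frames, num_samples) if a is not None]
    if len(given) != 1:
        raise ValueError(
            "Expected exactly one of duration, num_frames, or num_samples."
        )


check_pad_args(duration=10.0)  # OK: a single target is given
```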
2 changes: 1 addition & 1 deletion lhotse/dataset/audio_tagging.py
@@ -78,7 +78,7 @@ def __init__(
def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
"""
Return a new batch, with the batch size automatically determined using the constraints
of max_frames and max_cuts.
of max_duration and max_cuts.
"""
self.hdf5_fix.update()

4 changes: 2 additions & 2 deletions lhotse/dataset/sampling/bucketing.py
@@ -30,7 +30,7 @@ class BucketingSampler(CutSampler):
... # BucketingSampler specific args
... sampler_type=SimpleCutSampler, num_buckets=20,
... # Args passed into SimpleCutSampler
... max_frames=20000
... max_duration=200
... )

Bucketing sampler with 20 buckets, sampling pairs of source-target cuts::
@@ -40,7 +40,7 @@
... # BucketingSampler specific args
... sampler_type=CutPairsSampler, num_buckets=20,
... # Args passed into CutPairsSampler
... max_source_frames=20000, max_target_frames=15000
... max_source_duration=200, max_target_duration=150
... )
"""

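The docstring above describes bucketing: group cuts of similar duration so batches need little padding. A sketch of the bucket-assignment idea with hand-picked boundaries (``BucketingSampler`` derives its own boundaries from the data; ``assign_bucket`` is illustrative):

```python
from typing import List


def assign_bucket(duration: float, boundaries: List[float]) -> int:
    """Return the index of the first bucket whose upper bound fits the cut."""
    for i, upper in enumerate(boundaries):
        if duration <= upper:
            return i
    return len(boundaries)  # final open-ended bucket for the longest cuts


boundaries = [2.0, 5.0, 10.0]
assert assign_bucket(1.5, boundaries) == 0
assert assign_bucket(7.0, boundaries) == 2
assert assign_bucket(30.0, boundaries) == 3
```

A batch is then drawn from a single bucket, so its cuts have similar durations.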
8 changes: 4 additions & 4 deletions lhotse/dataset/sampling/cut_pairs.py
@@ -12,10 +12,10 @@ class CutPairsSampler(CutSampler):
It expects that both CutSet's strictly consist of Cuts with corresponding IDs.
It behaves like an iterable that yields lists of strings (cut IDs).

When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified,
When one of :attr:`max_source_duration`, :attr:`max_target_duration`, or :attr:`max_cuts` is specified,
the batch size is dynamic.
Exactly zero or one of those constraints can be specified.
Padding required to collate the batch does not contribute to max frames/samples/duration.
Padding required to collate the batch does not contribute to max_source_duration/max_target_duration.
"""

def __init__(
@@ -229,7 +229,7 @@ def _next_batch(self) -> Tuple[CutSet, CutSet]:
self.source_constraints.add(next_source_cut)
self.target_constraints.add(next_target_cut)

# Did we exceed the max_source_frames and max_cuts constraints?
# Did we exceed the max_source_duration and max_cuts constraints?
if (
not self.source_constraints.exceeded()
and not self.target_constraints.exceeded()
@@ -249,7 +249,7 @@ def _next_batch(self) -> Tuple[CutSet, CutSet]:
# and return the cut anyway.
warnings.warn(
"The first cut drawn in batch collection violates one of the max_... constraints; "
"we'll return it anyway. Consider increasing max_source_frames/max_cuts/etc."
"we'll return it anyway. Consider increasing max_source_duration/max_cuts/etc."
)
source_cuts.append(next_source_cut)
target_cuts.append(next_target_cut)
2 changes: 1 addition & 1 deletion lhotse/dataset/sampling/dynamic.py
@@ -335,7 +335,7 @@ def detuplify(
else next_cut_or_tpl
)

# Did we exceed the max_frames and max_cuts constraints?
# Did we exceed the max_duration and max_cuts constraints?
if self.constraint.close_to_exceeding():
# Yes. Finish sampling this batch.
if self.constraint.exceeded() and len(cuts) == 1:
12 changes: 6 additions & 6 deletions lhotse/dataset/sampling/simple.py
@@ -11,10 +11,10 @@ class SimpleCutSampler(CutSampler):
Samples cuts from a CutSet to satisfy the input constraints.
It behaves like an iterable that yields lists of strings (cut IDs).

When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified,
When one of :attr:`max_duration` or :attr:`max_cuts` is specified,
the batch size is dynamic.
Exactly zero or one of those constraints can be specified.
Padding required to collate the batch does not contribute to max frames/samples/duration.
Padding required to collate the batch does not contribute to max duration.

Example usage::

@@ -197,10 +197,10 @@ def _next_batch(self) -> CutSet:
self.diagnostics.discard_single(next_cut)
continue

# Track the duration/frames/etc. constraints.
# Track the duration/etc. constraints.
self.time_constraint.add(next_cut)

# Did we exceed the max_frames and max_cuts constraints?
# Did we exceed the max_duration and max_cuts constraints?
if not self.time_constraint.exceeded():
# No - add the next cut to the batch, and keep trying.
cuts.append(next_cut)
@@ -215,9 +215,9 @@
# and return the cut anyway.
warnings.warn(
"The first cut drawn in batch collection violates "
"the max_frames, max_cuts, or max_duration constraints - "
"the max_duration or max_cuts constraints - "
"we'll return it anyway. "
"Consider increasing max_frames/max_cuts/max_duration."
"Consider increasing max_duration/max_cuts."
)
cuts.append(next_cut)

2 changes: 1 addition & 1 deletion lhotse/dataset/sampling/weighted_simple.py
@@ -15,7 +15,7 @@ class WeightedSimpleCutSampler(SimpleCutSampler):
When performing sampling, it avoids having duplicated cuts in the same batch.
The sampler terminates if the number of sampled cuts reaches :attr:`num_samples`.

When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified,
When one of :attr:`max_duration`, or :attr:`max_cuts` is specified,
the batch size is dynamic.

Example usage:
2 changes: 1 addition & 1 deletion lhotse/dataset/speech_recognition.py
@@ -94,7 +94,7 @@ def __init__(
def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
"""
Return a new batch, with the batch size automatically determined using the constraints
of max_frames and max_cuts.
of max_duration and max_cuts.
"""
validate_for_asr(cuts)

2 changes: 1 addition & 1 deletion lhotse/dataset/speech_translation.py
@@ -97,7 +97,7 @@ def __init__(
def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
"""
Return a new batch, with the batch size automatically determined using the constraints
of max_frames and max_cuts.
of max_duration and max_cuts.
"""
validate_for_asr(cuts)
self.hdf5_fix.update()
2 changes: 1 addition & 1 deletion lhotse/dataset/surt.py
@@ -170,7 +170,7 @@ def __init__(
def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
"""
Return a new batch, with the batch size automatically determined using the constraints
of max_frames and max_cuts.
of max_duration and max_cuts.
"""
validate_for_asr(cuts)

2 changes: 1 addition & 1 deletion lhotse/parallel.py
@@ -88,7 +88,7 @@ class ParallelExecutor:

>>> class MyRunner:
... def __init__(self):
... self.name = name
... pass
... def __call__(self, x):
... return f'processed: {x}'
...
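The docstring fix above removes a stray ``self.name = name`` that referenced an undefined variable in the ``MyRunner`` example. The same pattern — a callable object mapped over inputs in parallel — can be reproduced with the stdlib alone (using ``concurrent.futures`` here rather than lhotse's ``ParallelExecutor``):

```python
from concurrent.futures import ThreadPoolExecutor


class MyRunner:
    def __init__(self):
        pass  # no per-instance state needed for this example

    def __call__(self, x):
        return f"processed: {x}"


# map the callable over inputs using a small thread pool
with ThreadPoolExecutor(max_workers=2) as ex:
    results = list(ex.map(MyRunner(), ["a", "b"]))

# results == ["processed: a", "processed: b"]
```

``ex.map`` preserves input order, so the results line up with the inputs even though they may be computed concurrently.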
2 changes: 2 additions & 0 deletions lhotse/recipes/__init__.py
@@ -35,6 +35,7 @@
from .gale_arabic import prepare_gale_arabic
from .gale_mandarin import prepare_gale_mandarin
from .gigaspeech import prepare_gigaspeech
from .gigaspeech2 import prepare_gigaspeech2
from .gigast import download_gigast, prepare_gigast
from .grid import download_grid, prepare_grid
from .heroico import download_heroico, prepare_heroico
@@ -152,6 +153,7 @@
"prepare_gale_arabic",
"prepare_gale_mandarin",
"prepare_gigaspeech",
"prepare_gigaspeech2",
"download_gigast",
"prepare_gigast",
"download_grid",