Merge branch 'lhotse-speech:master' into gigaspeech2

bookbot-hive · Nov 24, 2024 · 49a0f26 · 49a0f26
2 parents e478ade + 36ce63e
commit 49a0f26
Show file tree

Hide file tree

Showing 29 changed files with 209 additions and 152 deletions.
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -22,14 +22,14 @@ jobs:
           - python-version: "3.9"
             torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
-          - python-version: "3.10"
-            torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.10"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
-          - python-version: "3.11"
-            torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.11"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
-          - python-version: "3.12"
-            torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.12"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
 
       fail-fast: false

diff --git a/README.md b/README.md
@@ -116,7 +116,8 @@ Lhotse uses several environment variables to customize it's behavior. They are a
 
 ### Optional dependencies
 
-**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package like this: `pip install lhotse[package_name]`. The supported optional packages include:
+**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package:
+- `torchaudio` used to be a core dependency in Lhotse, but is now optional. Refer to the [official PyTorch documentation for installation](https://pytorch.org/get-started/locally/).
 - `pip install lhotse[kaldi]` for a maximal feature set related to Kaldi compatibility. It includes libraries such as `kaldi_native_io` (a more efficient variant of `kaldi_io`) and `kaldifeat` that port some of Kaldi functionality into Python.
 - `pip install lhotse[orjson]` for up to 50% faster reading of JSONL manifests.
 - `pip install lhotse[webdataset]`. We support "compiling" your data into WebDataset tarball format for more effective IO. You can still interact with the data as if it was a regular lazy CutSet. To learn more, check out the following tutorial: [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lhotse-speech/lhotse/blob/master/examples/02-webdataset-integration.ipynb)

diff --git a/docs/conf.py b/docs/conf.py
@@ -78,4 +78,4 @@
     "exclude-members": "__weakref__",
 }
 
-autodoc_mock_imports = ["torchaudio", "SoundFile", "soundfile"]
+autodoc_mock_imports = ["SoundFile", "soundfile"]
diff --git a/docs/datasets.rst b/docs/datasets.rst
@@ -28,7 +28,7 @@ It allows for interesting collation methods - e.g. **padding the speech with noi
 
 The items for mini-batch creation are selected by the ``Sampler``.
 Lhotse defines ``Sampler`` classes that are initialized with :class:`~lhotse.cut.CutSet`'s, so that they can look up specific properties of an utterance to stratify the sampling.
-For example, :class:`~lhotse.dataset.sampling.SimpleCutSampler` has a defined ``max_frames`` attribute, and it will keep sampling cuts for a batch until they do not exceed the specified number of frames.
+For example, :class:`~lhotse.dataset.sampling.SimpleCutSampler` has a defined ``max_duration`` attribute, and it will keep sampling cuts for a batch until they do not exceed the specified number of seconds.
 Another strategy — used in :class:`~lhotse.dataset.sampling.BucketingSampler` — will first group the cuts of similar durations into buckets, and then randomly select a bucket to draw the whole batch from.
 
 For tasks where both input and output of the model are speech utterances, we can use the :class:`~lhotse.dataset.sampling.CutPairsSampler`, which accepts two :class:`~lhotse.cut.CutSet`'s and will match the cuts in them by their IDs.
@@ -38,11 +38,11 @@ A typical Lhotse's dataset API usage might look like this:
 .. code-block::
 
     from torch.utils.data import DataLoader
-    from lhotse.dataset import SpeechRecognitionDataset, SimpleCutSampler
+    from lhotse.dataset import K2SpeechRecognitionDataset, SimpleCutSampler
 
     cuts = CutSet(...)
-    dset = SpeechRecognitionDataset(cuts)
-    sampler = SimpleCutSampler(cuts, max_frames=50000)
+    dset = K2SpeechRecognitionDataset(cuts)
+    sampler = SimpleCutSampler(cuts, max_duration=500)
     # Dataset performs batching by itself, so we have to indicate that
     # to the DataLoader with batch_size=None
     dloader = DataLoader(dset, sampler=sampler, batch_size=None, num_workers=1)

diff --git a/docs/getting-started.rst b/docs/getting-started.rst
@@ -143,7 +143,9 @@ Lhotse uses several environment variables to customize it's behavior. They are a
 Optional dependencies
 *********************
 
-**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package like this: ``pip install lhotse[package_name]``. The supported optional packages include:
+**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package:
+
+* ``torchaudio`` used to be a core dependency in Lhotse, but is now optional. Refer to the `official Pytorch documentation for installation`_.
 
 * ``pip install lhotse[kaldi]`` for a maximal feature set related to Kaldi compatibility. It includes libraries such as ``kaldi_native_io`` (a more efficient variant of ``kaldi_io``) and ``kaldifeat`` that port some of Kaldi functionality into Python.
 
@@ -230,3 +232,4 @@ the speech starts roughly at the first second (100 frames):
 .. _Icefall recipes: https://github.com/k2-fsa/icefall
 .. _orjson: https://pypi.org/project/orjson/
 .. _AIStore: https://aiatscale.org
+.. _official Pytorch documentation for installation: https://pytorch.org/get-started/locally/
diff --git a/lhotse/audio/recording.py b/lhotse/audio/recording.py
@@ -8,7 +8,7 @@
 import torch
 from _decimal import ROUND_HALF_UP
 
-from lhotse.audio.backend import info, save_audio, torchaudio_info
+from lhotse.audio.backend import get_current_audio_backend, info, save_audio
 from lhotse.audio.source import AudioSource
 from lhotse.audio.utils import (
     AudioLoadingError,
@@ -168,6 +168,23 @@ def is_placeholder(self) -> bool:
     def num_channels(self) -> int:
         return len(self.channel_ids)
 
+    @property
+    def source_format(self) -> str:
+        """Infer format of the audio sources.
+        If all sources have the same format, return it.
+        If sources have different formats, raise an error.
+        """
+        source_formats = list(set([s.format for s in self.sources]))
+
+        if len(source_formats) == 1:
+            # if all sources have the same format, return it
+            return source_formats[0]
+        else:
+            # at the moment, we don't resolve different formats
+            raise NotImplementedError(
+                "Sources have different formats. Resolving to a single format not implemented."
+            )
+
     @staticmethod
     def from_file(
         path: Pathlike,
@@ -260,7 +277,7 @@ def from_bytes(
         :return: a new ``Recording`` instance that owns the byte string data.
         """
         stream = BytesIO(data)
-        audio_info = torchaudio_info(stream)
+        audio_info = get_current_audio_backend().info(stream)
         return Recording(
             id=recording_id,
             sampling_rate=audio_info.samplerate,

diff --git a/lhotse/audio/source.py b/lhotse/audio/source.py
@@ -1,3 +1,5 @@
+import io
+import os
 import warnings
 from dataclasses import dataclass
 from io import BytesIO, FileIO
@@ -6,6 +8,7 @@
 from typing import List, Optional, Tuple, Union
 
 import numpy as np
+import soundfile as sf
 import torch
 
 from lhotse.audio.backend import read_audio
@@ -64,6 +67,10 @@ class AudioSource:
     def has_video(self) -> bool:
         return self.video is not None
 
+    @property
+    def format(self) -> str:
+        return self._get_format()
+
     def load_audio(
         self,
         offset: Seconds = 0.0,
@@ -316,3 +323,24 @@ def _prepare_for_reading(
             )
 
         return source
+
+    def _get_format(self) -> str:
+        """Get format for the audio source.
+        If using 'file' or 'url' types, the format is inferred from the file extension, as in soundfile.
+        If using 'memory' type, the format is inferred from the binary data.
+        """
+        if self.type in ("file", "url"):
+            # Resolve audio format based on the filename
+            format = os.path.splitext(self.source)[-1][1:]
+            return format.lower()
+        elif self.type == "memory":
+            sf_info = sf.info(io.BytesIO(self.source))
+            if sf_info.format == "OGG" and sf_info.subtype == "OPUS":
+                # soundfile describes opus as ogg container with opus coding
+                return "opus"
+            else:
+                return sf_info.format.lower()
+        else:
+            raise NotImplementedError(
+                f"Getting format not implemented for source type {self.type}"
+            )
diff --git a/lhotse/bin/lhotse.py b/lhotse/bin/lhotse.py
@@ -1,22 +1,6 @@
 #!/usr/bin/env python3
 """
-Use this script like:
-
-$ lhotse --help
-$ lhotse make-feats --help
-$ lhotse make-feats --compressed recording_manifest.yml mfcc_dir/
-$ lhotse write-default-feature-config feat-conf.yml
-$ lhotse kaldi import data/train 16000 train_manifests/
-$ lhotse split 3 audio.yml split_manifests/
-$ lhotse combine feature.1.yml feature.2.yml combined_feature.yml
-$ lhotse recipe --help
-$ lhotse recipe librimix-dataprep path/to/librimix.csv output_manifests_dir/
-$ lhotse recipe librimix-obtain target_dir/
-$ lhotse recipe mini-librispeech-dataprep corpus_dir/ output_manifests_dir/
-$ lhotse recipe mini-librispeech-obtain target_dir/
-$ lhotse cut --help
-$ lhotse cut simple supervisions.yml features.yml simple_cuts.yml
-$ lhotse cut stereo-mixed supervisions.yml features.yml mixed_cuts.yml
+For usage, see the CLI documentation: https://lhotse.readthedocs.io/en/latest/cli.html
 """
 
 # Note: we import all the CLI modes here so they get auto-registered

diff --git a/lhotse/bin/modes/shar.py b/lhotse/bin/modes/shar.py
@@ -27,8 +27,8 @@ def shar():
     "-a",
     "--audio",
     default="none",
-    type=click.Choice(["none", "wav", "flac", "mp3", "opus"]),
-    help="Format in which to export audio (disabled by default, enabling will make a copy of the data)",
+    type=click.Choice(["none", "wav", "flac", "mp3", "opus", "original"]),
+    help="Format in which to export audio. Original will save in the same format as the original audio (disabled by default, enabling will make a copy of the data)",
 )
 @click.option(
     "-f",

diff --git a/lhotse/cut/data.py b/lhotse/cut/data.py
@@ -723,7 +723,7 @@ def pad(
         """
         Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.
 
-        The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
+        The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
         or a specific number of samples `num_samples`. The three arguments are mutually exclusive.
 
         :param duration: The cut's minimal duration after padding.

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
@@ -622,7 +622,7 @@ def pad(
         """
         Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.
 
-        The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
+        The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
         or a specific number of samples `num_samples`. The three arguments are mutually exclusive.
 
         :param duration: The cut's minimal duration after padding.

diff --git a/lhotse/cut/padding.py b/lhotse/cut/padding.py
@@ -236,7 +236,7 @@ def pad(
         """
         Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.
 
-        The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
+        The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
         or a specific number of samples `num_samples`. The three arguments are mutually exclusive.
 
         :param duration: The cut's minimal duration after padding.

diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py
@@ -2821,7 +2821,7 @@ def pad(
     """
     Return a new MixedCut, padded with zeros in the recording, and ``pad_feat_value`` in each feature bin.
 
-    The user can choose to pad either to a specific `duration`; a specific number of frames `max_frames`;
+    The user can choose to pad either to a specific `duration`; a specific number of frames `num_frames`;
     or a specific number of samples `num_samples`. The three arguments are mutually exclusive.
 
     :param cut: DataCut to be padded.

diff --git a/lhotse/dataset/audio_tagging.py b/lhotse/dataset/audio_tagging.py
@@ -78,7 +78,7 @@ def __init__(
     def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
         """
         Return a new batch, with the batch size automatically determined using the constraints
-        of max_frames and max_cuts.
+        of max_duration and max_cuts.
         """
         self.hdf5_fix.update()
 

diff --git a/lhotse/dataset/sampling/bucketing.py b/lhotse/dataset/sampling/bucketing.py
@@ -30,7 +30,7 @@ class BucketingSampler(CutSampler):
         ...    # BucketingSampler specific args
         ...    sampler_type=SimpleCutSampler, num_buckets=20,
         ...    # Args passed into SimpleCutSampler
-        ...    max_frames=20000
+        ...    max_duration=200
         ... )
 
     Bucketing sampler with 20 buckets, sampling pairs of source-target cuts::
@@ -40,7 +40,7 @@ class BucketingSampler(CutSampler):
         ...    # BucketingSampler specific args
         ...    sampler_type=CutPairsSampler, num_buckets=20,
         ...    # Args passed into CutPairsSampler
-        ...    max_source_frames=20000, max_target_frames=15000
+        ...    max_source_duration=200, max_target_duration=150
         ... )
     """
 

diff --git a/lhotse/dataset/sampling/cut_pairs.py b/lhotse/dataset/sampling/cut_pairs.py
@@ -12,10 +12,10 @@ class CutPairsSampler(CutSampler):
     It expects that both CutSet's strictly consist of Cuts with corresponding IDs.
     It behaves like an iterable that yields lists of strings (cut IDs).
 
-    When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified,
+    When one of :attr:`max_source_duration`, :attr:`max_target_duration`, or :attr:`max_cuts` is specified,
     the batch size is dynamic.
     Exactly zero or one of those constraints can be specified.
-    Padding required to collate the batch does not contribute to max frames/samples/duration.
+    Padding required to collate the batch does not contribute to max source_duration/target_duration.
     """
 
     def __init__(
@@ -229,7 +229,7 @@ def _next_batch(self) -> Tuple[CutSet, CutSet]:
             self.source_constraints.add(next_source_cut)
             self.target_constraints.add(next_target_cut)
 
-            # Did we exceed the max_source_frames and max_cuts constraints?
+            # Did we exceed the max_source_duration and max_cuts constraints?
             if (
                 not self.source_constraints.exceeded()
                 and not self.target_constraints.exceeded()
@@ -249,7 +249,7 @@ def _next_batch(self) -> Tuple[CutSet, CutSet]:
                     # and return the cut anyway.
                     warnings.warn(
                         "The first cut drawn in batch collection violates one of the max_... constraints"
-                        "we'll return it anyway. Consider increasing max_source_frames/max_cuts/etc."
+                        "we'll return it anyway. Consider increasing max_source_duration/max_cuts/etc."
                     )
                     source_cuts.append(next_source_cut)
                     target_cuts.append(next_target_cut)

diff --git a/lhotse/dataset/sampling/dynamic.py b/lhotse/dataset/sampling/dynamic.py
@@ -335,7 +335,7 @@ def detuplify(
                 else next_cut_or_tpl
             )
 
-            # Did we exceed the max_frames and max_cuts constraints?
+            # Did we exceed the max_duration and max_cuts constraints?
             if self.constraint.close_to_exceeding():
                 # Yes. Finish sampling this batch.
                 if self.constraint.exceeded() and len(cuts) == 1:

diff --git a/lhotse/dataset/sampling/simple.py b/lhotse/dataset/sampling/simple.py
@@ -11,10 +11,10 @@ class SimpleCutSampler(CutSampler):
     Samples cuts from a CutSet to satisfy the input constraints.
     It behaves like an iterable that yields lists of strings (cut IDs).
 
-    When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified,
+    When one of :attr:`max_duration` or :attr:`max_cuts` is specified,
     the batch size is dynamic.
     Exactly zero or one of those constraints can be specified.
-    Padding required to collate the batch does not contribute to max frames/samples/duration.
+    Padding required to collate the batch does not contribute to max duration.
 
     Example usage::
 
@@ -197,10 +197,10 @@ def _next_batch(self) -> CutSet:
                 self.diagnostics.discard_single(next_cut)
                 continue
 
-            # Track the duration/frames/etc. constraints.
+            # Track the duration/etc. constraints.
             self.time_constraint.add(next_cut)
 
-            # Did we exceed the max_frames and max_cuts constraints?
+            # Did we exceed the max_duration and max_cuts constraints?
             if not self.time_constraint.exceeded():
                 # No - add the next cut to the batch, and keep trying.
                 cuts.append(next_cut)
@@ -215,9 +215,9 @@ def _next_batch(self) -> CutSet:
                     # and return the cut anyway.
                     warnings.warn(
                         "The first cut drawn in batch collection violates "
-                        "the max_frames, max_cuts, or max_duration constraints - "
+                        "the max_duration, or max_cuts constraints - "
                         "we'll return it anyway. "
-                        "Consider increasing max_frames/max_cuts/max_duration."
+                        "Consider increasing max_duration/max_cuts."
                     )
                     cuts.append(next_cut)
 

diff --git a/lhotse/dataset/sampling/weighted_simple.py b/lhotse/dataset/sampling/weighted_simple.py
@@ -15,7 +15,7 @@ class WeightedSimpleCutSampler(SimpleCutSampler):
     When performing sampling, it avoids having duplicated cuts in the same batch.
     The sampler terminates if the number of sampled cuts reach :attr:`num_samples`
 
-    When one of :attr:`max_frames`, :attr:`max_samples`, or :attr:`max_duration` is specified,
+    When one of :attr:`max_duration` or :attr:`max_cuts` is specified,
     the batch size is dynamic.
 
     Example usage:

diff --git a/lhotse/dataset/speech_recognition.py b/lhotse/dataset/speech_recognition.py
@@ -94,7 +94,7 @@ def __init__(
     def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
         """
         Return a new batch, with the batch size automatically determined using the constraints
-        of max_frames and max_cuts.
+        of max_duration and max_cuts.
         """
         validate_for_asr(cuts)
 

diff --git a/lhotse/dataset/speech_translation.py b/lhotse/dataset/speech_translation.py
@@ -97,7 +97,7 @@ def __init__(
     def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
         """
         Return a new batch, with the batch size automatically determined using the constraints
-        of max_frames and max_cuts.
+        of max_duration and max_cuts.
         """
         validate_for_asr(cuts)
         self.hdf5_fix.update()

diff --git a/lhotse/dataset/surt.py b/lhotse/dataset/surt.py
@@ -170,7 +170,7 @@ def __init__(
     def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
         """
         Return a new batch, with the batch size automatically determined using the constraints
-        of max_frames and max_cuts.
+        of max_duration and max_cuts.
         """
         validate_for_asr(cuts)
 

diff --git a/lhotse/parallel.py b/lhotse/parallel.py
@@ -88,7 +88,7 @@ class ParallelExecutor:
 
         >>> class MyRunner:
         ...     def __init__(self):
-        ...         self.name = name
+        ...         pass
         ...     def __call__(self, x):
         ...         return f'processed: {x}'
         ...