Merge branch 'lhotse-speech-master'

bookbot-hive · Nov 7, 2024 · a14220f · a14220f
2 parents 52e3e59 + a4faab9
commit a14220f
Show file tree

Hide file tree

Showing 114 changed files with 8,289 additions and 944 deletions.
diff --git a/.github/workflows/missing_torchaudio.yml b/.github/workflows/missing_torchaudio.yml
@@ -16,8 +16,8 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: "3.11"
-            torch-install-cmd: "pip install torch==2.0 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.12"
+            torch-install-cmd: "pip install torch==2.3 --index-url https://download.pytorch.org/whl/cpu"
 
       fail-fast: false
 

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -20,16 +20,16 @@ jobs:
             torch-install-cmd: "pip install torch==1.12.1 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: kaldifeat
           - python-version: "3.9"
-            torch-install-cmd: "pip install torch==2.0 torchaudio==2.0 --extra-index-url https://download.pytorch.org/whl/cpu"
-            extra_deps: kaldifeat
-          - python-version: "3.10"
-            torch-install-cmd: "pip install torch==2.1 torchaudio==2.1 --extra-index-url https://download.pytorch.org/whl/cpu"
+            torch-install-cmd: "pip install torch==2.3 torchaudio==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
+            extra_deps: ""
+          - python-version: "3.10"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
-          - python-version: "3.11"
-            torch-install-cmd: "pip install torch==2.2 torchaudio==2.2 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.11"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
-          - python-version: "3.12"
-            torch-install-cmd: "pip install torch==2.2 torchaudio==2.2 --extra-index-url https://download.pytorch.org/whl/cpu"
+          - python-version: "3.12"  # note: no torchaudio
+            torch-install-cmd: "pip install torch==2.3 --extra-index-url https://download.pytorch.org/whl/cpu"
             extra_deps: ""
 
       fail-fast: false
@@ -55,7 +55,7 @@ jobs:
         # Force the installation of a CPU-only PyTorch
         ${{ matrix.torch-install-cmd }}
         # the torchaudio env var does nothing when torchaudio is installed, but doesn't require it's presence when it's not
-        pip install '.[tests]'
+        pip install lilcom '.[tests]'
         # Enable some optional tests
         pip install h5py dill smart_open[http] kaldi_native_io webdataset==0.2.5 s3prl scipy nara_wpe pyloudnorm ${{ matrix.extra_deps }}
     - name: Install sph2pipe

diff --git a/NOTICE b/NOTICE
@@ -0,0 +1,20 @@
+Lhotse
+Copyright 2020-2024 Piotr Żelasko
+Copyright 2020-2024 Johns Hopkins University
+Copyright 2020-2024 Xiaomi Corporation
+Copyright 2022-2023 Meaning.Team Inc.
+Copyright 2023-2024 NVIDIA Corporation
+
+This repository includes software developed by:
+- Johns Hopkins University
+- Xiaomi Corporation
+- Meaning.Team Inc.
+- NVIDIA Corporation
+- other organizations and individuals.
+
+This project includes contributions from various organizations and individuals.
+Only major copyright holders are listed here.
+For a complete list of contributors, please refer to the project's version control history.
+
+Licensed under the Apache License, Version 2.0 (the "License").
+See the LICENSE file for the full contents of the license.
diff --git a/README.md b/README.md
@@ -110,17 +110,20 @@ Lhotse uses several environment variables to customize it's behavior. They are a
 - `LHOTSE_LEGACY_OPUS_LOADING` - (`=1`) reverts to a legacy OPUS loading mechanism that triggered a new ffmpeg subprocess for each OPUS file.
 - `LHOTSE_PREPARING_RELEASE` - used internally by developers when releasing a new version of Lhotse.
 - `TORCHAUDIO_USE_BACKEND_DISPATCHER` - when set to `1` and torchaudio version is below 2.1, we'll enable the experimental ffmpeg backend of torchaudio.
+- `AIS_ENDPOINT` is read by AIStore client to determine AIStore endpoint URL. Required for AIStore dataloading.
 - `RANK`, `WORLD_SIZE`, `WORKER`, and `NUM_WORKERS` are internally used to inform Lhotse Shar dataloading subprocesses.
 - `READTHEDOCS` is internally used for documentation builds.
 
 ### Optional dependencies
 
-**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package like this: `pip install lhotse[package_name]`. The supported optional packages include:
+**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package:
+- `torchaudio` used to be a core dependency in Lhotse, but is now optional. Refer to [official PyTorch documentation for installation](https://pytorch.org/get-started/locally/).
 - `pip install lhotse[kaldi]` for a maximal feature set related to Kaldi compatibility. It includes libraries such as `kaldi_native_io` (a more efficient variant of `kaldi_io`) and `kaldifeat` that port some of Kaldi functionality into Python.
 - `pip install lhotse[orjson]` for up to 50% faster reading of JSONL manifests.
 - `pip install lhotse[webdataset]`. We support "compiling" your data into WebDataset tarball format for more effective IO. You can still interact with the data as if it was a regular lazy CutSet. To learn more, check out the following tutorial: [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lhotse-speech/lhotse/blob/master/examples/02-webdataset-integration.ipynb)
 - `pip install h5py` if you want to extract speech features and store them as HDF5 arrays.
 - `pip install dill`. When `dill` is installed, we'll use it to pickle CutSet that uses a lambda function in calls such as `.map` or `.filter`. This is helpful in PyTorch DataLoader with `num_jobs>0`. Without `dill`, depending on your environment, you'll see an exception or a hanging script.
+- `pip install aistore` to read manifests, tar files, and other data from AIStore using AIStore-supported URLs (set the `AIS_ENDPOINT` environment variable to activate it). See [AIStore documentation](https://aiatscale.org) for more details.
 - `pip install smart_open` to read and write manifests and data in any location supported by `smart_open` (e.g. cloud, http).
 - `pip install opensmile` for feature extraction using the OpenSmile toolkit's Python wrapper.
 

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.22.0
+1.28.0
diff --git a/docs/conf.py b/docs/conf.py
@@ -78,4 +78,4 @@
     "exclude-members": "__weakref__",
 }
 
-autodoc_mock_imports = ["torchaudio", "SoundFile", "soundfile"]
+autodoc_mock_imports = ["SoundFile", "soundfile"]
diff --git a/docs/corpus.rst b/docs/corpus.rst
@@ -99,6 +99,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_earnings21`
   * - Earnings'22
     - :func:`lhotse.recipes.prepare_earnings22`
+  * - EARS
+    - :func:`lhotse.recipes.prepare_ears`
   * - The Edinburgh International Accents of English Corpus
     - :func:`lhotse.recipes.prepare_edacc`
   * - English Broadcast News 1997
@@ -107,6 +109,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_fisher_english`
   * - Fisher Spanish
     - :func:`lhotse.recipes.prepare_fisher_spanish`
+  * - FLEURS
+    - :func:`lhotse.recipes.prepare_fleurs`
   * - Fluent Speech Commands
     - :func:`lhotse.recipes.slu`
   * - GALE Arabic Broadcast Speech
@@ -131,6 +135,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_iwslt22_ta`
   * - KeSpeech
     - :func:`lhotse.recipes.prepare_kespeech`
+  * - KsponSpeech
+    - :func:`lhotse.recipes.prepare_ksponspeech`
   * - L2 Arctic
     - :func:`lhotse.recipes.prepare_l2_arctic`
   * - LibriCSS
@@ -145,6 +151,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_librittsr`
   * - LJ Speech
     - :func:`lhotse.recipes.prepare_ljspeech`
+  * - MDCC
+    - :func:`lhotse.recipes.prepare_mdcc`
   * - Medical
     - :func:`lhotse.recipes.prepare_medical`
   * - MiniLibriMix
@@ -163,14 +171,24 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_nsc`
   * - People's Speech
     - :func:`lhotse.recipes.prepare_peoples_speech`
+  * - ReazonSpeech
+    - :func:`lhotse.recipes.prepare_reazonspeech`
   * - RIRs and Noises Corpus (OpenSLR 28)
     - :func:`lhotse.recipes.prepare_rir_noise`
+  * - SBCSAE
+    - :func:`lhotse.recipes.prepare_sbcsae`
+  * - Spatial-LibriSpeech
+    - :func:`lhotse.recipes.prepare_spatial_librispeech`
   * - Speech Commands
     - :func:`lhotse.recipes.prepare_speechcommands`
+  * - SpeechIO
+    - :func:`lhotse.recipes.prepare_speechio`
   * - SPGISpeech
     - :func:`lhotse.recipes.prepare_spgispeech`
   * - Switchboard
     - :func:`lhotse.recipes.prepare_switchboard`
+  * - TED-LIUM v2
+    - :func:`lhotse.recipes.prepare_tedlium2`
   * - TED-LIUM v3
     - :func:`lhotse.recipes.prepare_tedlium`
   * - TIMIT
@@ -189,8 +207,12 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_voxpopuli`
   * - WenetSpeech
     - :func:`lhotse.recipes.prepare_wenet_speech`
+  * - WenetSpeech4TTS
+    - :func:`lhotse.recipes.prepare_wenetspeech4tts`
   * - YesNo
     - :func:`lhotse.recipes.prepare_yesno`
+  * - Emilia
+    - :func:`lhotse.recipes.prepare_emilia`
   * - Eval2000
     - :func:`lhotse.recipes.prepare_eval2000`
   * - MGB2

diff --git a/docs/datasets.rst b/docs/datasets.rst
@@ -141,6 +141,76 @@ However, many functions and classes in Lhotse accept either a random seed or an
 .. note:: The lazy seed resolution is done by calling :func:`lhotse.dataset.dataloading.resolve_seed`.
 
 
+Customizing sampling constraints
+--------------------------------
+
+Since version 1.22.0, Lhotse provides a mechanism to customize how samplers measure the "length" of each example
+for the purpose of determining dynamic batch size. To leverage this option, use the keyword argument ``constraint``
+in :class:`~lhotse.dataset.sampling.DynamicCutSampler` or :class:`~lhotse.dataset.sampling.DynamicBucketingSampler`.
+The sampling criteria are defined by implementing a subclass of :class:`~lhotse.dataset.sampling.base.SamplingConstraint`:
+
+.. autoclass:: lhotse.dataset.sampling.base.SamplingConstraint
+    :members:
+
+The default constraint is :class:`~lhotse.dataset.sampling.base.TimeConstraint` which is created from
+``max_duration``, ``max_cuts``, and ``quadratic_duration`` arguments passed to the sampler's constructor.
+
+Sampling non-audio data
+***********************
+
+Because :class:`~lhotse.dataset.sampling.base.SamplingConstraint` defines the method ``measure_length``,
+it's possible to use a different attribute than duration (or a different formula) for computing the effective batch size.
+This enables re-using Lhotse's sampling algorithms for data other than speech, and passing around objects other than :class:`~lhotse.cut.Cut`.
+
+To showcase this, we added experimental support for text-only dataloading. We introduced a few classes specifically for this purpose:
+
+.. autoclass:: lhotse.cut.text.TextExample
+    :members:
+
+.. autoclass:: lhotse.cut.text.TextPairExample
+    :members:
+
+.. autoclass:: lhotse.lazy.LazyTxtIterator
+    :members:
+
+.. autoclass:: lhotse.dataset.sampling.base.TokenConstraint
+    :members:
+
+A minimal example of how to perform text-only dataloading is available below (note that any of these classes may be replaced by your own implementation if that is more suitable to your work)::
+
+    import torch
+    import numpy as np
+    from lhotse import CutSet
+    from lhotse.lazy import LazyTxtIterator
+    from lhotse.cut.text import TextPairExample
+    from lhotse.dataset import DynamicBucketingSampler, TokenConstraint
+    from lhotse.dataset.collation import collate_vectors
+
+    examples = CutSet(LazyTxtIterator("data.txt"))
+
+    def tokenize(example):
+        # tokenize as individual bytes; BPE or another technique may be used here instead
+        example.tokens = np.frombuffer(example.text.encode("utf-8"), np.int8)
+        return example
+
+    examples = examples.map(tokenize, apply_fn=None)
+
+    sampler = DynamicBucketingSampler(examples, constraint=TokenConstraint(max_tokens=1024, quadratic_length=128),      num_buckets=2)
+
+    class ExampleTextDataset(torch.utils.data.Dataset):
+        def __getitem__(self, examples: CutSet):
+            tokens = [ex.tokens for ex in examples]
+            token_lens = torch.tensor([len(t) for t in tokens])
+            tokens = collate_vectors(tokens, padding_value=-1)
+            return tokens, token_lens
+
+    dloader = torch.utils.data.DataLoader(ExampleTextDataset(), sampler=sampler, batch_size=None)
+
+    for batch in dloader:
+        print(batch)
+
+.. note:: Support for this kind of dataloading is experimental in Lhotse. If you run into any rough edges, please let us know.
+
 Dataset's list
 --------------
 

diff --git a/docs/getting-started.rst b/docs/getting-started.rst
@@ -133,6 +133,8 @@ Lhotse uses several environment variables to customize it's behavior. They are a
 
 * ``TORCHAUDIO_USE_BACKEND_DISPATCHER`` - when set to 1 and torchaudio version is below 2.1, we'll enable the experimental ffmpeg backend of torchaudio.
 
+* ``AIS_ENDPOINT`` is read by AIStore client to determine AIStore endpoint URL. Required for AIStore dataloading.
+
 * ``RANK``, ``WORLD_SIZE``, ``WORKER``, and ``NUM_WORKERS`` are internally used to inform Lhotse Shar dataloading subprocesses.
 
 * ``READTHEDOCS`` is internally used for documentation builds.
@@ -141,7 +143,9 @@ Lhotse uses several environment variables to customize it's behavior. They are a
 Optional dependencies
 *********************
 
-**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package like this: ``pip install lhotse[package_name]``. The supported optional packages include:
+**Other pip packages.** You can leverage optional features of Lhotse by installing the relevant supporting package:
+
+* ``torchaudio`` used to be a core dependency in Lhotse, but is now optional. Refer to the `official Pytorch documentation for installation`_.
 
 * ``pip install lhotse[kaldi]`` for a maximal feature set related to Kaldi compatibility. It includes libraries such as ``kaldi_native_io`` (a more efficient variant of ``kaldi_io``) and ``kaldifeat`` that port some of Kaldi functionality into Python.
 
@@ -153,6 +157,8 @@ Optional dependencies
 
 * ``pip install dill``. When ``dill`` is installed, we'll use it to pickle CutSet that uses a lambda function in calls such as ``.map`` or ``.filter``. This is helpful in PyTorch DataLoader with ``num_jobs>0``. Without ``dill``, depending on your environment, you'll see an exception or a hanging script.
 
+* ``pip install aistore`` to read manifests, tar files, and other data from AIStore using AIStore-supported URLs (set the ``AIS_ENDPOINT`` environment variable to activate it). See `AIStore`_ for more details.
+
 * ``pip install smart_open`` to read and write manifests and data in any location supported by ``smart_open`` (e.g. cloud, http).
 
 * ``pip install opensmile`` for feature extraction using the OpenSmile toolkit's Python wrapper.
@@ -225,3 +231,5 @@ the speech starts roughly at the first second (100 frames):
 .. _Kaldi: https://github.com/kaldi-asr/kaldi
 .. _Icefall recipes: https://github.com/k2-fsa/icefall
 .. _orjson: https://pypi.org/project/orjson/
+.. _AIStore: https://aiatscale.org
+.. _official Pytorch documentation for installation: https://pytorch.org/get-started/locally/
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,5 +1,5 @@
 numpy>=1.18.1
 sphinx_rtd_theme==2.0.0
-sphinx==7.2.6
+sphinx==7.1.2
 sphinx-click==5.1.0
 sphinx-autodoc-typehints==2.0.0
diff --git a/lhotse/array.py b/lhotse/array.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 
-from lhotse.utils import Pathlike, Seconds, fastcopy, ifnone
+from lhotse.utils import Pathlike, Seconds, fastcopy
 
 
 @dataclass
@@ -51,6 +51,16 @@ class Array:
     def ndim(self) -> int:
         return len(self.shape)
 
+    @property
+    def is_in_memory(self) -> bool:
+        from lhotse.features.io import is_in_memory
+
+        return is_in_memory(self.storage_type)
+
+    @property
+    def is_placeholder(self) -> bool:
+        return self.storage_type == "shar"
+
     def to_dict(self) -> dict:
         return asdict(self)
 
@@ -157,6 +167,14 @@ class TemporalArray:
     # the shape, temporal_dim, and frame_shift.
     start: Seconds
 
+    @property
+    def is_in_memory(self) -> bool:
+        return self.array.is_in_memory
+
+    @property
+    def is_placeholder(self) -> bool:
+        return self.array.is_placeholder
+
     @property
     def shape(self) -> List[int]:
         return self.array.shape

diff --git a/lhotse/audio/backend.py b/lhotse/audio/backend.py
@@ -808,7 +808,8 @@ def torchaudio_info(
 
     if torchaudio_ffmpeg_backend_available():
         # Torchaudio 2.1 with official "ffmpeg" backend should solve all the special cases below.
-        info = torchaudio.info(path_or_fileobj, backend="ffmpeg")
+        backend = "ffmpeg" if "ffmpeg" in torchaudio.list_audio_backends() else None
+        info = torchaudio.info(path_or_fileobj, backend=backend)
         return LibsndfileCompatibleAudioInfo(
             channels=info.num_channels,
             frames=info.num_frames,