8 changes: 8 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,8 @@
version: 2
updates:
# Maintain dependencies for GitHub Actions
# Enable version updates for github-actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
5 changes: 3 additions & 2 deletions .github/workflows/python-package.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
python-version: ['3.9', '3.10', '3.11', '3.12']

steps:
- uses: actions/checkout@v2
@@ -27,7 +27,8 @@
run: |
python -m pip install --upgrade pip
python -m pip install .
pip install flake8 pytest wheel torch
pip install torch --index-url https://download.pytorch.org/whl/cpu
pip install flake8 pytest wheel
- name: Test with pytest
run: |
pytest
22 changes: 22 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,22 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files

- repo: https://github.com/psf/black
rev: 25.1.0
hooks:
- id: black
exclude: ^(doc)

- repo: https://github.com/pycqa/isort
rev: 6.0.1
hooks:
- id: isort
exclude: ^(doc)
1 change: 0 additions & 1 deletion MANIFEST.in
@@ -1,2 +1 @@
include ctc_segmentation/ctc_segmentation_dyn.pyx

5 changes: 2 additions & 3 deletions Makefile
@@ -11,7 +11,7 @@ clean:

upload:
twine upload dist/*

test:
cd tests; python -c "import test_ctc_segmentation as test; test.test_ctc_segmentation()"
cd tests; python -c "import test_ctc_segmentation as test; test.test_determine_utterance_segments()"
@@ -23,7 +23,7 @@ test:
# To test the various installation methods:
github:
cd /; pip install git+https://github.com/lumaku/ctc-segmentation --user

pip:
cd /; pip install ctc-segmentation --user

@@ -32,4 +32,3 @@ local:

rm:
cd /; pip uninstall -y ctc-segmentation

44 changes: 22 additions & 22 deletions README.md
@@ -1,7 +1,7 @@
# CTC segmentation

<!-- Badges -->
[![build status](https://github.com/lumaku/ctc-segmentation/actions/workflows/python-package.yml/badge.svg)](https://github.com/lumaku/ctc-segmentation/actions/workflows/python-package.yml)
[![build status](https://github.com/espnet/ctc-segmentation/actions/workflows/python-package.yml/badge.svg)](https://github.com/espnet/ctc-segmentation/actions/workflows/python-package.yml)
[![version](https://img.shields.io/pypi/v/ctc-segmentation)](https://pypi.org/project/ctc-segmentation/)
[![AUR](https://img.shields.io/aur/version/python-ctc-segmentation-git)](https://aur.archlinux.org/packages/python-ctc-segmentation-git)
[![downloads](https://img.shields.io/pypi/dm/ctc-segmentation)](https://pypi.org/project/ctc-segmentation/)
@@ -19,7 +19,7 @@ The CTC segmentation package is not standalone, as it needs a neural network wit
* In ESPnet 1 as corpus recipe: [Alignment script](https://github.com/espnet/espnet/blob/master/espnet/bin/asr_align.py), [Example recipe](https://github.com/espnet/espnet/tree/master/egs/tedlium2/align1), [Demo](https://github.com/espnet/espnet#ctc-segmentation-demo)
* In ESPnet 2, as a script or directly as a Python interface: [Alignment script](https://github.com/espnet/espnet/blob/master/espnet2/bin/asr_align.py), [Demo](https://github.com/espnet/espnet#ctc-segmentation-demo)
* In Nvidia NeMo as dataset creation tool: [Documentation](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/ctc_segmentation.html), [Example](https://github.com/NVIDIA/NeMo/blob/main/tutorials/tools/CTC_Segmentation_Tutorial.ipynb)
* In Speechbrain, as python interface: [Alignment module](https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/alignment/ctc_segmentation.py), [Examples](https://gist.github.com/lumaku/75eca1c86d9467a54888d149dc7b84f1)
* In Speechbrain, as python interface: [Alignment module](https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/alignment/ctc_segmentation.py), [Examples](https://gist.github.com/espnet/75eca1c86d9467a54888d149dc7b84f1)

It can also be used with other frameworks:

@@ -59,29 +59,29 @@ def align_with_transcript(
with torch.no_grad():
logits = model(inputs.input_values).logits.cpu()[0]
probs = torch.nn.functional.softmax(logits,dim=-1)

# Tokenize transcripts
vocab = tokenizer.get_vocab()
inv_vocab = {v:k for k,v in vocab.items()}
unk_id = vocab["<unk>"]

tokens = []
for transcript in transcripts:
assert len(transcript) > 0
tok_ids = tokenizer(transcript.replace("\n"," ").lower())['input_ids']
        tok_ids = np.array(tok_ids, dtype=int)
tokens.append(tok_ids[tok_ids != unk_id])

# Align
char_list = [inv_vocab[i] for i in range(len(inv_vocab))]
config = ctc_segmentation.CtcSegmentationParameters(char_list=char_list)
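    # index_duration is the duration of one CTC output frame in seconds:
    # (number of audio samples) / (number of CTC frames) / (sample rate)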
config.index_duration = audio.shape[0] / probs.size()[0] / samplerate

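    # prepare_token_list aligns ground truth that is already tokenized into model token IDs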
ground_truth_mat, utt_begin_indices = ctc_segmentation.prepare_token_list(config, tokens)
timings, char_probs, state_list = ctc_segmentation.ctc_segmentation(config, probs.numpy(), ground_truth_mat)
segments = ctc_segmentation.determine_utterance_segments(config, utt_begin_indices, char_probs, timings, transcripts)
return [{"text" : t, "start" : p[0], "end" : p[1], "conf" : p[2]} for t,p in zip(transcripts, segments)]

def get_word_timestamps(
audio : np.ndarray,
samplerate : int = SAMPLERATE,
@@ -95,38 +95,38 @@ def get_word_timestamps(
with torch.no_grad():
logits = model(inputs.input_values).logits.cpu()[0]
probs = torch.nn.functional.softmax(logits,dim=-1)

predicted_ids = torch.argmax(logits, dim=-1)
pred_transcript = processor.decode(predicted_ids)

# Split the transcription into words
words = pred_transcript.split(" ")

# Align
vocab = tokenizer.get_vocab()
inv_vocab = {v:k for k,v in vocab.items()}
char_list = [inv_vocab[i] for i in range(len(inv_vocab))]
config = ctc_segmentation.CtcSegmentationParameters(char_list=char_list)
config.index_duration = audio.shape[0] / probs.size()[0] / samplerate

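    # prepare_text matches the ground-truth words character by character
    # against the model's character vocabulary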
ground_truth_mat, utt_begin_indices = ctc_segmentation.prepare_text(config, words)
timings, char_probs, state_list = ctc_segmentation.ctc_segmentation(config, probs.numpy(), ground_truth_mat)
segments = ctc_segmentation.determine_utterance_segments(config, utt_begin_indices, char_probs, timings, words)
return [{"text" : w, "start" : p[0], "end" : p[1], "conf" : p[2]} for w,p in zip(words, segments)]

print(align_with_transcript(audio,transcripts))
# [{'text': 'A MAN SAID TO THE UNIVERSE', 'start': 0.08124999999999993, 'end': 2.034375, 'conf': 0.0},
# {'text': 'SIR I EXIST', 'start': 2.3260775862068965, 'end': 4.078771551724138, 'conf': 0.0}]

print(get_word_timestamps(audio))
# [{'text': 'a', 'start': 0.08124999999999993, 'end': 0.5912715517241378, 'conf': 0.9999501323699951},
# {'text': 'man', 'start': 0.5912715517241378, 'end': 0.9219827586206896, 'conf': 0.9409108982174931},
# {'text': 'said', 'start': 0.9219827586206896, 'end': 1.2326508620689656, 'conf': 0.7700278702302796},
# {'text': 'to', 'start': 1.2326508620689656, 'end': 1.3529094827586206, 'conf': 0.5094435178226225},
# {'text': 'the', 'start': 1.3529094827586206, 'end': 1.4831896551724135, 'conf': 0.4580493446392211},
# {'text': 'universe', 'start': 1.4831896551724135, 'end': 2.034375, 'conf': 0.9285054256219009},
# {'text': 'sir', 'start': 2.3260775862068965, 'end': 3.036530172413793, 'conf': 0.0},
# {'text': 'i', 'start': 3.036530172413793, 'end': 3.347198275862069, 'conf': 0.7995760873559864},
# {'text': 'exist', 'start': 3.347198275862069, 'end': 4.078771551724138, 'conf': 0.0}]
```
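The example above is excerpted, so several names (`model`, `processor`, `tokenizer`, `inputs`, `audio`, `transcripts`, `SAMPLERATE`) come from elided setup code. A minimal sketch of that setup, assuming a Hugging Face wav2vec 2.0 CTC model; the model name and input file are illustrative choices, not part of this package:

```python
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

SAMPLERATE = 16000  # assumed: English wav2vec 2.0 models expect 16 kHz audio
MODEL_NAME = "facebook/wav2vec2-base-960h"  # hypothetical model choice

processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
tokenizer = processor.tokenizer
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Hypothetical input: a mono 16 kHz wav file and its known transcripts
audio, _ = sf.read("speech.wav")
transcripts = ["A MAN SAID TO THE UNIVERSE", "SIR I EXIST"]

# The functions above read `inputs.input_values`; presumably built like this:
inputs = processor(audio, sampling_rate=SAMPLERATE, return_tensors="pt")
```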

@@ -145,7 +145,7 @@ pip install ctc-segmentation

* From source:
```sh
git clone https://github.com/lumaku/ctc-segmentation
git clone https://github.com/espnet/ctc-segmentation
cd ctc-segmentation
cythonize -3 ctc_segmentation/ctc_segmentation_dyn.pyx
python setup.py build
@@ -229,7 +229,7 @@ For examples, see the `prepare_*` functions in `ctc_segmentation.py`, or the exa

### Segments clean-up

Segments that were written to a `segments` file can be filtered using the confidence score. This is the minimum confidence score in log space, as described in the paper.

Utterances with a low confidence score are discarded during data clean-up. This parameter may need adjustment depending on the dataset, the ASR model, and the text conversion used.
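A rough sketch of such a filter, reusing `align_with_transcript` from the usage example above; the threshold value is a placeholder to tune, not a recommendation:

```python
MIN_CONFIDENCE = 0.5  # placeholder threshold; tune per dataset and model

aligned = align_with_transcript(audio, transcripts)
kept = [seg for seg in aligned if seg["conf"] >= MIN_CONFIDENCE]
dropped = [seg for seg in aligned if seg["conf"] < MIN_CONFIDENCE]
```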

15 changes: 9 additions & 6 deletions ctc_segmentation/__init__.py
@@ -1,8 +1,11 @@
"""Import all functions of the CTC segmentation package."""
from .ctc_segmentation import ctc_segmentation
from .ctc_segmentation import CtcSegmentationParameters
from .ctc_segmentation import determine_utterance_segments
from .ctc_segmentation import prepare_text
from .ctc_segmentation import prepare_tokenized_text
from .ctc_segmentation import prepare_token_list

from .ctc_segmentation import (
CtcSegmentationParameters,
ctc_segmentation,
determine_utterance_segments,
prepare_text,
prepare_token_list,
prepare_tokenized_text,
)
from .partitioning import get_partitions
1 change: 1 addition & 0 deletions ctc_segmentation/ctc_segmentation.py
@@ -18,6 +18,7 @@
"""

import logging

import numpy as np

logger = logging.getLogger("ctc_segmentation")
2 changes: 2 additions & 0 deletions ctc_segmentation/ctc_segmentation_dyn.pyx
@@ -13,7 +13,9 @@ For a description, see https://arxiv.org/abs/2007.09127
"""

import logging

import numpy as np

cimport numpy as np


2 changes: 2 additions & 0 deletions setup.cfg
@@ -0,0 +1,2 @@
[isort]
profile = black
29 changes: 13 additions & 16 deletions setup.py
@@ -1,25 +1,26 @@
from setuptools import setup, find_packages, Extension
from setuptools.command.build_ext import build_ext
import numpy

from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext

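# Build from the .pyx source when Cython is available; otherwise fall back
# to the pre-generated C file.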
try:
from Cython.Build import cythonize

USE_CYTHON = True
except ImportError:
USE_CYTHON = False

# https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html
ext = '.pyx' if USE_CYTHON else '.c'
ext = ".pyx" if USE_CYTHON else ".c"
extensions = [
Extension(
name="ctc_segmentation.ctc_segmentation_dyn",
sources=["ctc_segmentation/ctc_segmentation_dyn"+ext],
sources=["ctc_segmentation/ctc_segmentation_dyn" + ext],
include_dirs=[numpy.get_include()],
)
]
if USE_CYTHON:
from Cython.Build import cythonize

extensions = cythonize(extensions)

package_information = """
@@ -29,28 +30,24 @@
It can be combined with CTC-based ASR models.
This package includes the core functions.

https://github.com/lumaku/ctc-segmentation
https://github.com/espnet/ctc-segmentation
"""

setup(
name="ctc_segmentation",
version="1.7.4",

python_requires='>=3.6',
version="1.7.5",
python_requires=">=3.9",
packages=find_packages(exclude=["tests"]),
setup_requires=["numpy"],
install_requires=["setuptools", "numpy", "Cython"],
tests_require=["pytest", "torch"],
zip_safe=False,
ext_modules=extensions,
cmdclass={'build_ext': build_ext},

cmdclass={"build_ext": build_ext},
author="Ludwig Kuerzinger <ludwig.kuerzinger@tum.de>, "
"Dominik Winkelbauer <dominik.winkelbauer@tum.de>",
description="CTC segmentation to align utterances within "
"large audio files.",
url="https://github.com/lumaku/ctc-segmentation",

"Dominik Winkelbauer <dominik.winkelbauer@tum.de>",
description="CTC segmentation to align utterances within " "large audio files.",
url="https://github.com/espnet/ctc-segmentation",
long_description_content_type="text/markdown",
long_description=package_information,
)
14 changes: 8 additions & 6 deletions tests/test_ctc_segmentation.py
@@ -7,12 +7,14 @@
"""Test functions for CTC segmentation."""
import numpy as np

from ctc_segmentation import ctc_segmentation
from ctc_segmentation import CtcSegmentationParameters
from ctc_segmentation import determine_utterance_segments
from ctc_segmentation import prepare_text
from ctc_segmentation import prepare_tokenized_text
from ctc_segmentation import prepare_token_list
from ctc_segmentation import (
CtcSegmentationParameters,
ctc_segmentation,
determine_utterance_segments,
prepare_text,
prepare_token_list,
prepare_tokenized_text,
)


def test_ctcsegmentationparameters():