From 2ca75bf1b22a4753078ead003260a978caf89d4c Mon Sep 17 00:00:00 2001
From: popcornell
Date: Tue, 24 Jun 2025 13:29:43 +0200
Subject: [PATCH 1/2] added phoneme counting duration

---
Review notes (placed after the "---" separator, so they are not part of the
commit message):

* Line breaks, indentation and blank context lines of this series were
  reconstructed from a whitespace-collapsed copy. Blank context lines were
  derived from the hunk line counts. Check the context against the tree
  before applying, or apply with `git apply --ignore-whitespace`.
* `import transphone` is added at module level of time_constrained.py, so
  importing that module requires transphone even when no phoneme-based
  strategy is selected. Consider importing it inside `phoneme_based`.
* `phoneme_based` calls `transphone.read_tokenizer(language)` on every call
  and before the empty / single-word early returns. The default
  `language=None` is forwarded to it unchanged.
* `g2p.tokenizer(w)` in this patch is corrected to `g2p.tokenize(w)` in
  PATCH 2/2.
* If the phoneme counts of all words sum to zero, `character_length`
  divides by zero.
* `split_words` now passes `language` as a third positional argument to
  every `word_level_timing_strategy`. The strategies in this file accept it
  through `*args`; a two-argument callable supplied by a caller (if that is
  supported) would not.
* `language=None` is inserted before `reference_sort` in
  `time_constrained_minimum_permutation_word_error_rate`. Unless those
  parameters are keyword-only, this shifts positional arguments.
* `time_constrained_siso_word_error_rate` gets no `language` parameter in
  this series.

 meeteval/wer/preprocess.py           | 11 ++++--
 meeteval/wer/wer/time_constrained.py | 50 ++++++++++++++++++++++++----
 2 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/meeteval/wer/preprocess.py b/meeteval/wer/preprocess.py
index 1e1783e..c0a6fdc 100644
--- a/meeteval/wer/preprocess.py
+++ b/meeteval/wer/preprocess.py
@@ -17,6 +17,7 @@ def split_words(
         keys=('words',),
         word_level_timing_strategy=None,
         segment_representation='word',  # 'segment', 'word', 'speaker'
+        language=None
 ):
     """
     Splits segments into words and copies all other entries.
@@ -77,8 +78,7 @@ def get_words(s):
         words = s['words'] or ['']
         timestamps = word_level_timing_strategy(
             (s['start_time'], s['end_time']),
-            words
-        )
+            words, language)
         s['start_time'] = [s for s, _ in timestamps]
         s['end_time'] = [s for _, s in timestamps]
 
@@ -263,6 +263,7 @@ def _preprocess_single(
         name=None,
         segment_index=False,  # 'segment', 'word', False
         segment_representation='word',  # 'segment', 'word', 'speaker'
+        language=None
 ):
     """
     >>> from paderbox.utils.pretty import pprint
@@ -428,7 +429,8 @@ def _preprocess_single(
     words = split_words(
         segments,
         word_level_timing_strategy=word_level_timing_strategy,
-        segment_representation=segment_representation
+        segment_representation=segment_representation,
+        language=language
     )
 
     # Warn or raise an exception if the order of the words contradicts the
@@ -517,6 +519,7 @@ def preprocess(
         hypothesis_pseudo_word_level_timing=None,
         segment_representation='segment',  # 'segment', 'word', 'speaker'
         ensure_single_session=True,
+        language=None
 ):
     """
     Preprocessing.
@@ -538,6 +541,7 @@ def preprocess(
         collar=None,  # collar is not applied to the reference
         word_level_timing_strategy=reference_pseudo_word_level_timing,
         segment_representation=segment_representation,
+        language=language
     )
     hypothesis, hypothesis_self_overlap = _preprocess_single(
         hypothesis,
@@ -549,6 +553,7 @@ def preprocess(
         collar=collar,
         word_level_timing_strategy=hypothesis_pseudo_word_level_timing,
         segment_representation=segment_representation,
+        language=language
     )
 
 
diff --git a/meeteval/wer/wer/time_constrained.py b/meeteval/wer/wer/time_constrained.py
index 84c3d32..32ec1e1 100644
--- a/meeteval/wer/wer/time_constrained.py
+++ b/meeteval/wer/wer/time_constrained.py
@@ -5,6 +5,8 @@ import typing
 
 from dataclasses import dataclass, replace
 
+import transphone
+
 from meeteval.io.pbjson import zip_strict
 from meeteval.io.stm import STM
 from meeteval.io.seglst import SegLST, seglst_map, asseglst, SegLstSegment
@@ -40,7 +42,7 @@ class Segment(TypedDict):
 
 
 # pseudo-timestamp strategies
-def equidistant_intervals(interval, words):
+def equidistant_intervals(interval, words, *args):
     """Divides the interval into `count` equally sized intervals
     """
     count = len(words)
@@ -57,7 +59,7 @@ def equidistant_intervals(interval, words):
     ]
 
 
-def equidistant_points(interval, words):
+def equidistant_points(interval, words, *args):
     """Places `count` points (intervals of size zero) in `interval` with equal distance"""
     count = len(words)
     if count == 0:
@@ -72,7 +74,7 @@ def equidistant_points(interval, words):
     ]
 
 
-def character_based(interval, words):
+def character_based(interval, words, *args):
     """Divides the interval into one interval per word where the size of the
     interval is proportional to the word length in characters."""
     if len(words) == 0:
@@ -93,7 +95,29 @@ def character_based(interval, words):
     ]
 
 
-def character_based_points(interval, words):
+def phoneme_based(interval, words, language):
+    """Divides the interval into one interval per word where the size of the interval is
+    proportional to the number of phonemes in the word."""
+
+    g2p = transphone.read_tokenizer(language)
+    if len(words) == 0:
+        return []
+    elif len(words) == 1:
+        return [interval]
+    import numpy as np
+    word_lengths = np.asarray([len(g2p.tokenizer(w)) for w in words])
+    end_points = np.cumsum(word_lengths)
+    total_num_characters = end_points[-1]
+    character_length = (interval[1] - interval[0]) / total_num_characters
+    return [
+        (
+            interval[0] + character_length * start,
+            interval[0] + character_length * end
+        )
+        for start, end in zip([0] + list(end_points[:-1]), end_points)
+    ]
+
+def character_based_points(interval, words, *args):
     """Places points in the center of the character-based intervals"""
     intervals = character_based(interval, words)
     intervals = [
@@ -102,13 +126,22 @@ def character_based_points(interval, words):
     ]
     return intervals
 
+def phoneme_based_points(interval, words, language):
+    """Places points in the center of the phoneme-based intervals"""
+    intervals = phoneme_based(interval, words, language)
+    intervals = [
+        ((interval[1] + interval[0]) / 2,) * 2
+        for interval in intervals
+    ]
+    return intervals
+
 
-def full_segment(interval, words):
+def full_segment(interval, words, *args):
     """Outputs `interval` for each word"""
     return [interval] * len(words)
 
 
-def no_segmentation(interval, words):
+def no_segmentation(interval, words, *args):
     if len(words) != 1:
         if len(words) > 1:
             raise ValueError(
@@ -131,6 +164,8 @@ def no_segmentation(interval, words):
     'equidistant_intervals': equidistant_intervals,
     'equidistant_points': equidistant_points,
     'full_segment': full_segment,
+    'phoneme_based': phoneme_based,
+    'phoneme_based_points': phoneme_based_points,
     'character_based': character_based,
     'character_based_points': character_based_points,
     'none': no_segmentation,
@@ -669,6 +704,7 @@ def time_constrained_minimum_permutation_word_error_rate(
         collar,
         reference_pseudo_word_level_timing='character_based',
         hypothesis_pseudo_word_level_timing='character_based_points',
+        language=None,
         reference_sort='segment',
         hypothesis_sort='segment',
 ) -> CPErrorRate:
@@ -707,7 +743,7 @@ def time_constrained_minimum_permutation_word_error_rate(
         collar=collar,
         reference_pseudo_word_level_timing=reference_pseudo_word_level_timing,
         hypothesis_pseudo_word_level_timing=hypothesis_pseudo_word_level_timing,
-        segment_representation='word',
+        segment_representation='word', language=language
     )
 
     er = _minimum_permutation_word_error_rate(

From a72cfe30a82072638a7b4a5f455150bf82694ede Mon Sep 17 00:00:00 2001
From: popcornell
Date: Tue, 24 Jun 2025 14:22:22 +0200
Subject: [PATCH 2/2] added tests

---
Review notes (placed after the "---" separator, so they are not part of the
commit message):

* Besides the tests, this patch carries the `tokenizer` -> `tokenize` fix
  for `phoneme_based` from PATCH 1/2; the subject line does not mention it.
* The change to `time_constrained_siso_word_error_rate` only moves the
  closing parenthesis of the signature.
* The new tests call the phoneme-based strategies with language="eng" and
  language="jpn", so they presumably need transphone and its language data
  to be available where the tests run -- confirm for CI.

 meeteval/wer/wer/time_constrained.py |  6 ++---
 tests/test_time_constrained.py       | 39 ++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/meeteval/wer/wer/time_constrained.py b/meeteval/wer/wer/time_constrained.py
index 32ec1e1..76ea1d7 100644
--- a/meeteval/wer/wer/time_constrained.py
+++ b/meeteval/wer/wer/time_constrained.py
@@ -105,7 +105,8 @@ def phoneme_based(interval, words, language):
     elif len(words) == 1:
         return [interval]
     import numpy as np
-    word_lengths = np.asarray([len(g2p.tokenizer(w)) for w in words])
+
+    word_lengths = np.asarray([len(g2p.tokenize(w)) for w in words])
     end_points = np.cumsum(word_lengths)
     total_num_characters = end_points[-1]
     character_length = (interval[1] - interval[0]) / total_num_characters
@@ -620,8 +621,7 @@ def time_constrained_siso_word_error_rate(
         reference_pseudo_word_level_timing='character_based',
         hypothesis_pseudo_word_level_timing='character_based_points',
         reference_sort='segment',
-        hypothesis_sort='segment',
-):
+        hypothesis_sort='segment'):
     """
     Time-constrained word error rate for single-speaker transcripts.
 
diff --git a/tests/test_time_constrained.py b/tests/test_time_constrained.py
index 4220224..7bb788e 100644
--- a/tests/test_time_constrained.py
+++ b/tests/test_time_constrained.py
@@ -204,6 +204,15 @@ def test_time_constrained_sorting_options():
     )
     assert er.error_rate == 0
 
+    er = time_constrained_minimum_permutation_word_error_rate(
+        r1, r1, reference_sort='word',
+        reference_pseudo_word_level_timing='phoneme_based',
+        hypothesis_pseudo_word_level_timing='phoneme_based',
+        language="eng",
+        collar=0,
+    )
+    assert er.error_rate == 0
+
     r1 = SegLST([
         {'words': 'a b c d', 'start_time': 0, 'end_time': 4, 'speaker': 'A'},
         {'words': 'e f g h', 'start_time': 2, 'end_time': 6, 'speaker': 'A'},
@@ -217,6 +226,16 @@ def test_time_constrained_sorting_options():
     )
     assert er.error_rate == 0.75
 
+    er = time_constrained_minimum_permutation_word_error_rate(
+        r1, r2, reference_sort='word',
+        reference_pseudo_word_level_timing='phoneme_based',
+        hypothesis_pseudo_word_level_timing='phoneme_based_points',
+        language="eng",
+        collar=0,
+    )
+    # reference will be: ['ʌ'], ['b', 'i'], ['s', 'i'], ['d', 'i'], ['i'], ['ɛ', 'f'], ['d͡ʒ', 'i'], ['e', 'j', 't͡ʃ']
+    assert er.error_rate == 0.625
+
     er = time_constrained_minimum_permutation_word_error_rate(
         r1, r2, reference_sort='segment',
         collar=0,
@@ -255,6 +274,26 @@ def test_time_constrained_sorting_options():
     )
     assert er.error_rate == 1
 
+    # japanese testing with kanji char
+    # whitespace on characters
+    r1 = SegLST([
+        {'words': '\u4f11 \u65e5', 'start_time': 4, 'end_time': 8, 'speaker': 'A'},  # holiday
+        {'words': '\u4eca \u65e5', 'start_time': 0, 'end_time': 4, 'speaker': 'A'},  # today
+    ])
+    r2 = SegLST([
+        {'words': '\u4f11 \u65e5 \u4eca \u65e5', 'start_time': 0, 'end_time': 8, 'speaker': 'A'},
+    ])
+
+    er = time_constrained_minimum_permutation_word_error_rate(
+        r1, r2, reference_sort='word',
+        reference_pseudo_word_level_timing='phoneme_based',
+        hypothesis_pseudo_word_level_timing='phoneme_based_points',
+        language="jpn",
+        collar=5,
+    )
+    assert er.error_rate == 0.5
+
+
 
 def test_examples_zero_self_overlap():
     """Tests that self-overlap is measured correctly (0) for the example files"""