# phonemize.py
from collections import Counter
from piper_phonemize import (
phonemize_espeak,
phonemize_codepoints,
phoneme_ids_espeak,
phoneme_ids_codepoints,
get_codepoints_map,
get_espeak_map,
get_max_phonemes,
tashkeel_run,
)
# -----------------------------------------------------------------------------
# Maximum number of phonemes in a Piper model.
# Larger than necessary to accommodate future phonemes.
assert get_max_phonemes() == 256
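
# Extra sanity check (an addition to the original script): every id in the
# espeak table should fit below this limit, since the ids index a phoneme
# embedding of that size.
assert all(i < get_max_phonemes() for ids in get_espeak_map().values() for i in ids)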
# -----------------------------------------------------------------------------
de_phonemes = phonemize_espeak("licht!", "de")  # German: "light!"
# "lˈɪçt!" where "ç" is decomposed into two codepoints
assert de_phonemes == [["l", "ˈ", "ɪ", "c", "̧", "t", "!"]], de_phonemes
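
# Illustration (an addition, standard library only): "ç" NFD-normalizes to
# "c" plus a combining cedilla, which is why it appears as two codepoints above.
import unicodedata

assert unicodedata.normalize("NFD", "ç") == "c\u0327"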
# phoneme -> [id, ...]
espeak_map = get_espeak_map()
for phoneme in de_phonemes[0]:
assert phoneme in espeak_map, f"Missing phoneme: {phoneme}"
de_ids = phoneme_ids_espeak(de_phonemes[0])
# 0 = pad
# 1 = bos
# 2 = eos
# 4 = !
assert de_ids == [1, 0, 24, 0, 120, 0, 74, 0, 16, 0, 140, 0, 32, 0, 4, 0, 2]
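
# Round-trip sketch (an addition): invert the espeak map to decode the ids,
# assuming the default table maps each phoneme to a single unique id. Skip
# pad (0), bos (1), and eos (2) while decoding.
id_to_phoneme = {i: p for p, ids in espeak_map.items() for i in ids}
decoded = [id_to_phoneme[i] for i in de_ids if i not in (0, 1, 2)]
assert decoded == de_phonemes[0], decoded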
# Verify missing phoneme counts
missing_phonemes: Counter[str] = Counter()
assert phoneme_ids_espeak(["\u0000", "\u0000", "\u0000"], missing_phonemes) == [1, 0, 2]
assert missing_phonemes == {"\u0000": 3}, missing_phonemes
# -----------------------------------------------------------------------------
# Commented-out experiments, kept for reference:
# cn_phonemes = phonemize_espeak("这又是一个测试", "zh-cn")  # "This is another test"
# Capitalization is required to get espeak to split the sentences.
# en_phonemes = phonemize_espeak("The twelve the eggs I keep.", "en-us")
# print(en_phonemes)
# Expected output for the input "Test 1. Test 2." (the original upstream example):
# assert en_phonemes == [
#     ["t", "ˈ", "ɛ", "s", "t", " ", "w", "ˈ", "ʌ", "n", "."],
#     ["t", "ˈ", "ɛ", "s", "t", " ", "t", "ˈ", "u", "ː", "."],
# ], en_phonemes
# Expected output for "The twelve the eggs I keep.":
# assert en_phonemes == [["ð", "ə", " ", "t", "w", "ˈ", "ɛ", "l", "v", " ", "ð", "ɪ",
#     " ", "ˈ", "ɛ", "ɡ", "z", " ", "a", "ɪ", " ", "k", "ˈ", "i", "ː", "p", "."]], en_phonemes
def en_phonemes(transcripts):
    """Phonemize English text with the espeak backend (en-us voice)."""
    return phonemize_espeak(transcripts, "en-us")
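
# Example usage of the helper above; the sample sentence is arbitrary.
print(en_phonemes("Hello world."))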
# -----------------------------------------------------------------------------
codepoints_map = get_codepoints_map()
assert "uk" in codepoints_map, "uk not supported"
uk_phonemes = phonemize_codepoints("ВЕСЕ́ЛКА")  # Ukrainian: "rainbow"
# case folding / NFD normalization is automatically applied
assert uk_phonemes == [["в", "е", "с", "е", "́", "л", "к", "а"]]
for phoneme in uk_phonemes[0]:
assert phoneme in codepoints_map["uk"]
uk_ids = phoneme_ids_codepoints("uk", uk_phonemes[0])
# 0 = pad
# 1 = bos
# 2 = eos
assert uk_ids == [1, 0, 14, 0, 18, 0, 33, 0, 18, 0, 45, 0, 27, 0, 26, 0, 12, 0, 2]
# Casing can be changed, but this will break models trained with the default ("fold").
assert phonemize_codepoints("ВЕСЕ́ЛКА", casing="upper") == [
["В", "Е", "С", "Е", "́", "Л", "К", "А"]
]
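
# Illustration (an addition to the original script): ids are looked up in the
# same fold-cased map, so upper-cased symbols would likely be reported as
# missing. A sketch assuming the default "uk" table contains only fold-cased
# codepoints:
upper_missing: Counter[str] = Counter()
phoneme_ids_codepoints("uk", ["В"], upper_missing)
print("upper-case 'В' reported missing:", upper_missing.get("В", 0), "time(s)")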
# Verify missing phoneme counts
missing_phonemes = Counter()
assert phoneme_ids_codepoints(
"uk", ["\u0000", "\u0000", "\u0000"], missing_phonemes
) == [1, 0, 2]
assert missing_phonemes == {"\u0000": 3}, missing_phonemes
# -----------------------------------------------------------------------------
# Test Arabic with libtashkeel (https://github.com/mush42/libtashkeel)
expected_text = "مَرْحَبًا"  # "marhaban" ("hello") with diacritics added
actual_text = tashkeel_run("مرحبا")  # undiacritized "hello"
assert actual_text == expected_text, f"Expected {expected_text}, got {actual_text}"
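
# Round-trip sketch (an addition, standard library only): stripping the
# combining diacritics (Unicode category "Mn") from the vocalized output
# should recover the undiacritized input.
import unicodedata

stripped = "".join(c for c in actual_text if unicodedata.category(c) != "Mn")
assert stripped == "مرحبا", stripped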
# -----------------------------------------------------------------------------
print("OK")