-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmake_symbols.py
49 lines (45 loc) · 1.39 KB
/
make_symbols.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# %%
import unicodedata
import pandas as pd
import re
from text.jp_phonemizer import _CONVRULES
# Load the training and validation metadata tables. Both files are
# '|'-delimited with no header row, so the four column names are supplied
# explicitly.
_METADATA_COLUMNS = ['filename', 'speaker', 'text', 'raw_text']
train_df = pd.read_csv(
    './preprocessed_data/visual_novel.en_trim_dur/train.txt',
    delimiter='|',
    names=_METADATA_COLUMNS,
)
val_df = pd.read_csv(
    './preprocessed_data/visual_novel.en_trim_dur/val.txt',
    delimiter='|',
    names=_METADATA_COLUMNS,
)
# %%
# Characters to pre-register ahead of any data-derived symbols; currently
# empty, so the table starts out with no entries.
_letters = ""
# Symbol table, keyed by symbol. Each pre-registered letter maps to itself;
# dict insertion order is what later fixes the order of the symbols.
symbols_dict = {letter: letter for letter in _letters}
def map_fn(text, symbols_dict):
    """Register every symbol found in *text* into *symbols_dict*, in place.

    The first and last characters of *text* are discarded (presumably a
    pair of enclosing delimiters such as ``{...}`` -- confirm against the
    preprocessing output) and the remainder is split on whitespace. Every
    resulting token that is not yet a key is added, mapped to itself;
    existing entries keep their current value and position.

    Args:
        text: Delimiter-wrapped, whitespace-separated symbol string.
            Non-string values (for example the NaN that pandas produces
            for an empty field) are ignored instead of raising.
        symbols_dict: Mapping that is updated in place.

    Returns:
        None. The result is delivered through *symbols_dict*.
    """
    if not isinstance(text, str):
        # Slicing a float NaN would raise TypeError; a row with no text
        # simply contributes no symbols.
        return
    for symbol in text[1:-1].split():
        # setdefault inserts only when the key is missing.
        symbols_dict.setdefault(symbol, symbol)
# %%
# Seed the table from the phonemizer's conversion rules. For each rule
# string, the text between its first and second '/' (or up to the end of
# the string when there is only one '/') is split on whitespace, and every
# token not yet present is registered, mapped to itself.
# NOTE(review): assumes every entry of _CONVRULES is a string containing at
# least one '/'; otherwise the [1] lookup raises IndexError -- confirm.
for symbols in _CONVRULES:
    for symbol in symbols.split('/')[1].split():
        # setdefault inserts only when the key is missing.
        symbols_dict.setdefault(symbol, symbol)
# %%
# Scan the 'text' column of both splits and register any symbol that is not
# in the table yet. map_fn is called purely for its side effect on
# symbols_dict, so plain loops are used instead of Series.map, which would
# build and discard a Series of None values. Training rows are visited
# before validation rows, preserving the original registration order.
for split_df in (train_df, val_df):
    for phoneme_text in split_df['text']:
        map_fn(phoneme_text, symbols_dict)
# %%
# Replace each symbol's placeholder value with its position in insertion
# order. Only values are reassigned (no key is added or removed), so
# iterating the dict while updating it is safe.
for i, symbol in enumerate(symbols_dict):
    symbols_dict[symbol] = i
# %%
# Join the symbols with single spaces, in insertion order, and apply NFKC
# normalization to the joined string.
# NOTE(review): only this string is normalized; the keys held in
# symbols_dict are left as collected -- confirm that is intended.
norm_symbol = unicodedata.normalize('NFKC', " ".join(symbols_dict))
# %%
# Persist the normalized, space-separated symbol string as UTF-8 text,
# overwriting any existing file.
with open('./text/jp_symbol.txt', 'w', encoding='utf-8') as f:
    f.write(norm_symbol)
# %%
# Read the symbol file back as raw bytes and decode it explicitly as UTF-8,
# the encoding it was written with, to check the round trip.
with open('./text/jp_symbol.txt', 'rb') as f:
    text = f.read().decode('utf-8')
# %%
# Bare expression: shows the decoded file contents when this cell is run
# interactively; it has no effect when the file runs as a plain script.
text
# %%