# 轉做調型資料.py
# NOTE: the upstream repository was archived by its owner on May 14, 2024
# and is now read-only.
import csv
import re
import unicodedata
from pathlib import Path
# File name of the dictionary CSV; main() uses the same name for the raw input
# and for every generated output file.
tongmia = '《臺灣客家語常用詞辭典》內容資料(1100430).csv'

# Tone tables: each maps a tone-value digit string (as it appears right after
# a romanised syllable) to the tone-mark string that uann() substitutes for it.
# An empty string means the digits are simply dropped.

# Used by main() for the 四縣 (Sixian) and 南四縣 (Southern Sixian) columns.
四縣聲調表 = {
    '24': 'ˊ',
    '33': '+',
    '11': 'ˇ',
    '31': 'ˋ',
    '55': '',
    '2': 'ˋ',
    '5': '',
    '53': 'ˋ',  # e.g. 畀 bi53
}
# Used by main() for the 海陸 (Hailu) column.
海陸聲調表 = {
    '53': 'ˋ',
    '55': '',
    '24': 'ˊ',
    '11': 'ˇ',
    '33': '+',
    '5': '',
    '2': 'ˋ',
}
# Used by main() for the 大埔 (Dapu) columns.
大埔聲調表 = {
    '33': '+',
    '35': 'ˊ',
    '113': 'ˇ',
    '31': '^',
    '53': 'ˋ',
    '21': '^',
    '54': 'ˋ',
}
# Used by main() for the 饒平 (Raoping) columns.
饒平聲調表 = {
    '11': 'ˇ',
    '55': '',
    '53': 'ˋ',
    '24': 'ˊ',
    '2': 'ˋ',
    '5': '',
}
# Used by main() for the 詔安 (Zhao'an) columns.
詔安聲調表 = {
    '11': 'ˇ',
    '53': 'ˋ',
    '31': '^',
    '55': '',
    '24': 'ˊ',
    '43': 'ˋ',
}
# Maps Private Use Area (PUA) code points found in the source CSV to the text
# that biang_zosii() substitutes for them. The trailing comment on each entry
# gives the code point of the replacement character.
造字表 = {
    '\ue72c': '𫟧',  # U+2B7E7
    '\ue0c7': '𫠛',  # U+2B81B
    '\ue711': '𫝘',  # U+2B758
    '\ue700': '𫣆',  # U+2B8C6
    '\ue725': '𬠖',  # U+2C816
    '\ue76f': '⿺皮卜',  # no Unicode code point; written as a description sequence
    # The entries below appear only in the synonym/antonym fields; they
    # correspond to the 「客語教典PUA」 column of the 豆腐烏 external-character
    # table.
    '\uf354': '䞚',  # U+479A
    '\uf369': '𧊅',  # U+27285
    '\uf36e': '𧩣',  # U+27A63
    '\uf36f': '𩜰',  # U+29730
    '\uf374': '𢯭',  # U+22BED
    '\uf3b9': '𥉌',  # U+2524C
    '\uf545': '⿺皮卜',  # no Unicode code point; written as a description sequence
}
def uann(lomaji, pio):
    """Replace the tone digits after each romanised syllable with tone marks.

    A syllable is a run of lowercase ASCII letters immediately followed by a
    key of ``pio`` that is not itself followed by another digit. The key is
    replaced by its mapped tone mark; the letters are kept. Digit runs that
    are not a key of ``pio`` are left untouched.

    Args:
        lomaji: The romanised text to convert. ``None`` or ``''`` (for example
            an empty or missing CSV cell) is returned unchanged.
        pio: Mapping from tone-digit string to replacement tone mark.

    Returns:
        The converted text, or ``lomaji`` itself when there is nothing to do.
    """
    if not lomaji or not pio:
        return lomaji

    # Build a single alternation so the text is scanned once instead of once
    # per table entry. Keys are escaped so they always match literally, and
    # longer keys are tried first so the outcome never depends on dict order.
    tat_alternatives = '|'.join(
        re.escape(tat) for tat in sorted(pio, key=len, reverse=True)
    )
    pattern = r'([a-z]+)(' + tat_alternatives + r')(?![\d])'

    # A callable replacement keeps marks such as '+' or '^' literal, and a
    # single pass means inserted marks are never re-examined by other entries.
    return re.sub(
        pattern,
        lambda match: match.group(1) + pio[match.group(2)],
        lomaji,
    )
def biang_zosii(row_dict):
    """Replace mapped private-use characters in every string cell of a row.

    Each string value of ``row_dict`` has the characters listed in ``造字表``
    replaced by their mapped text. Any character that is still in Unicode
    category ``Co`` (private use) afterwards is reported on stdout with its
    code point, so unmapped characters can be added to the table.

    Args:
        row_dict: One CSV row as a dict; it is modified in place. Values that
            are not strings are left untouched.

    Returns:
        The same ``row_dict`` object, after modification.
    """
    # One translate() pass replaces all mapped characters at once.
    # str.maketrans requires every key of 造字表 to be a single character.
    zosii_table = str.maketrans(造字表)
    for col, value in row_dict.items():
        if not isinstance(value, str):
            # csv.DictReader yields None for missing cells and a list for
            # surplus cells; neither can contain text to replace.
            continue
        value = value.translate(zosii_table)
        row_dict[col] = value
        for ji in value:
            if unicodedata.category(ji) == 'Co':
                # Only format the code point when there is something to report.
                print(f'有造字字元,編碼{hex(ord(ji))}')
    return row_dict
def _convert_csv(guanpun_path, sin_path, tone_columns=()):
    """Copy one CSV file to another, normalising every row on the way.

    Every row is passed through ``biang_zosii``; then each column named in
    ``tone_columns`` is converted with ``uann`` using the paired tone table.
    The output keeps the input's header and column order.
    """
    # newline='' is required by the csv module so that newlines embedded in
    # quoted fields survive and no extra '\r' is written on Windows.
    # encoding='utf-8' keeps the CJK data independent of the platform locale.
    with open(guanpun_path, encoding='utf-8', newline='') as guanpun, \
            open(sin_path, 'w', encoding='utf-8', newline='') as sin:
        reader = csv.DictReader(guanpun)
        writer = csv.DictWriter(sin, fieldnames=reader.fieldnames)
        writer.writeheader()
        for row in reader:
            row = biang_zosii(row)
            for column, pio in tone_columns:
                row[column] = uann(row[column], pio)
            writer.writerow(row)


def main():
    """Generate the two derived copies of the dictionary CSV.

    Reads ``調值資料_raw/<tongmia>`` from the directory containing this script
    and writes two files with the same name:

    * ``調型資料/``: private-use characters replaced and the pronunciation
      columns listed below converted from tone digits to tone marks.
    * ``調值資料_uni/``: private-use characters replaced only.

    Both output directories must already exist.
    """
    # Pronunciation column -> tone table used to convert it.
    # NOTE(review): there is no 四縣腔相關字詞音讀 or 海陸腔相關字詞音讀 entry;
    # confirm whether that is intentional.
    tone_columns = (
        ('四縣腔音讀', 四縣聲調表),
        ('南四縣腔音讀', 四縣聲調表),
        ('南四縣相關字詞音讀', 四縣聲調表),
        ('海陸腔音讀', 海陸聲調表),
        ('大埔腔音讀', 大埔聲調表),
        ('大埔腔相關字詞音讀', 大埔聲調表),
        ('饒平腔音讀', 饒平聲調表),
        ('饒平腔相關字詞音讀', 饒平聲調表),
        ('詔安腔音讀', 詔安聲調表),
        ('詔安腔相關字詞音讀', 詔安聲調表),
    )
    base = Path(__file__).parent
    guanpun_path = base / '調值資料_raw' / tongmia

    _convert_csv(guanpun_path, base / '調型資料' / tongmia, tone_columns)
    _convert_csv(guanpun_path, base / '調值資料_uni' / tongmia)
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()