# main.py
# Requires the following packages: pyphen, pandas, numpy
# Uses '-' as the hyphen (syllable separator) in input and output
print('Starting Syllabificator...')
import pandas as pd
import re
import unicodedata
import logging
import warnings
# Suppress superfluous 'deprecated' warning upon loading CRF-keras
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
from algorithms import algo_brandt, algo_liang, algo_crf, algo_weijters, algo_neural
from algorithms import algo_neural_eng  # For the English neural net
from scripts import analyze
print('Libraries imported successfully.')
brandt_syl = algo_brandt.brandt_syllabificate
liang_syl = algo_liang.liang_syllabificate
crf_syl = algo_crf.crf_syllabificate
weijters_syl = algo_weijters.weijters_syllabificate
nn_syl = algo_neural.nn_syllabificate
nn_syl_eng = algo_neural_eng.nn_syllabificate
run_analysis = analyze.analyze_set
# Function for dehyphenating a word
def dehyphenate(word):
    return word.replace('-', '')
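# e.g. dehyphenate('syl-la-be') returns 'syllabe'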
# Input is a pandas Series containing the correct hyphenations. Set
# print_info to False to suppress the help output.
def run_all(reference_set, print_info=True):
    dehyphenated = reference_set.apply(dehyphenate)
    brandt_set = dehyphenated.apply(brandt_syl)
    liang_set = dehyphenated.apply(liang_syl)
    crf_set = dehyphenated.apply(crf_syl)
    weijters_set = dehyphenated.apply(weijters_syl, checks=200)
    nn_set = dehyphenated.apply(nn_syl)
    # x_set = dehyphenated.apply(algorithm_of_choice) to easily add another algorithm
    # y_set etc.
    print('Running algorithm performance analysis.')
    print('Note that the default naming order for Dataset 1 through 5 is Brandt, Liang, CRF, Weijters, NN')
    output = run_analysis(reference_set,
                          brandt_set,
                          liang_set,
                          crf_set,
                          weijters_set,
                          nn_set,
                          # x_set, y_set,
                          print_help=print_info)
    return output
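# Illustrative sketch only (kept commented out so it is not run on import);
# the Series below is hypothetical placeholder data, any '-'-hyphenated
# reference set works the same way:
# reference = pd.Series(['wo-nen', 'huis-deur', 'let-ter-gre-pen'])
# results = run_all(reference, print_info=False)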
# For running hyphenated lists against each other. Input is any number of
# Series, of which the first is the solution (reference set).
def run_sets(*input_datasets, print_info=True):
    output = run_analysis(*input_datasets,
                          print_help=print_info)
    return output
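# Illustrative sketch with hypothetical data: the first Series is the
# reference solution, the remaining Series are candidate hyphenations.
# reference = pd.Series(['wo-nen', 'huis-deur'])
# candidate = pd.Series(['wo-nen', 'huisd-eur'])
# scores = run_sets(reference, candidate, print_info=False)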
# For hyphenating an individual word. 'alg' selects which algorithm to use.
# The neural nets are trained for a max word length of 34 (Dutch) / 22 (English) characters.
def syllabificate_word(word, alg='c', pattern=2, w_size=20, language='nl'):
    if alg == 'l':
        hyphenated = liang_syl(word)
    if alg == 'b':
        hyphenated = brandt_syl(word)
    if alg == 'c':
        hyphenated = crf_syl(word, language)
    if alg == 'w':
        hyphenated = weijters_syl(word, checks=w_size)
    if alg == 'n':
        if language == 'nl' and len(word) < 35:
            hyphenated = nn_syl(word)
        elif language == 'nl' and len(word) >= 35:
            hyphenated = crf_syl(word)
            print('Word with length>34 detected, replacing with CRF')
        if language == 'eng' and len(word) < 23:
            hyphenated = nn_syl_eng(word)
        elif language == 'eng' and len(word) >= 23:
            hyphenated = crf_syl(word, language)
            print('Word with length>22 detected, replacing with CRF')
    return hyphenated
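# Illustrative sketch; the sample words are arbitrary placeholder input:
# syllabificate_word('lettergrepen', alg='n')                 # Dutch neural net
# syllabificate_word('syllables', alg='c', language='eng')    # English CRF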
# Quick helper that strips accents from special characters (e.g. 'é' -> 'e')
def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])
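# e.g. remove_accents('café') returns 'cafe'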
# Syllabificate a document. Numbers are ignored, special characters removed
def syllabificate_text(text, alg='c', pattern=2, w_size=2000, language='nl'):
    split_text = re.split(r'(\W|_)', remove_accents(text))
    output = []
    # Syllabificate each word; capitals are lowercased for the algorithms and
    # reinserted afterwards
    for section in split_text:
        if not section.isalpha():
            output.append(section)
            continue
        word = section
        word_original = word
        word = word.lower()
        hyphenated = syllabificate_word(word, alg, pattern, w_size, language)
        # For every letter of the plain word, record whether a hyphen follows it
        hyphens_positions = ([hyphenated[i+1] == '-' for i in
                              range(len(hyphenated)-1)
                              if hyphenated[i] != '-'] + [False])
        word_output = []
        # Rejoin using the original (capitalized) letters
        for letter in range(len(word)):
            if hyphens_positions[letter]:
                word_output.append(word_original[letter] + '-')
            else:
                word_output.append(word_original[letter])
        output.append(''.join(word_output))
    return ''.join(output)
print('To syllabify text, use syllabificate_text(\'Een bepaalde tekststring.\')')
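# Illustrative sketch of text-level usage; the sample strings are arbitrary
# placeholder input:
# syllabificate_text('Een bepaalde tekststring.')                          # defaults: CRF, Dutch
# syllabificate_text('A plain English sentence.', alg='n', language='eng')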