#!/usr/bin/env python3
from collections import defaultdict, namedtuple
import itertools
import math
import unittest

import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Codes for the three single-character edit operations used when mapping a
# parent stem onto a child stem.
INS = 1
DEL = 2
SUB = 3

def longest_common_substring_position(string_a, string_b):
    """Returns (start of lcs in string_a, start of lcs in string_b, length of lcs)"""
    # Initialize the longest-suffixes table.
    length_table = [[0] * (1 + len(string_b)) for i in range(1 + len(string_a))]
    # These will hold information about where the longest common substring
    # ends in the strings and about its length.
    longest_len = 0
    longest_substr_end_x = 0
    longest_substr_end_y = 0
    # Walk through the strings, matching all possible suffixes.
    for x in range(len(string_a)):
        for y in range(len(string_b)):
            if string_a[x] == string_b[y]:
                # We can extend the current longest suffix here by one.
                length_table[x + 1][y + 1] = length_table[x][y] + 1
                if length_table[x + 1][y + 1] > longest_len:
                    # If we've found a new longest substring, store information about it for later retrieval.
                    longest_len = length_table[x + 1][y + 1]
                    longest_substr_end_x = x + 1
                    longest_substr_end_y = y + 1
            else:
                # We cannot extend the current longest suffix, so start a new match instead of extending the old one.
                length_table[x + 1][y + 1] = 0
    return (longest_substr_end_x - longest_len, longest_substr_end_y - longest_len, longest_len)

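# For example, longest_common_substring_position("ahoj", "bhojahoj") returns
# (0, 4, 4): the longest common substring "ahoj" starts at index 0 in the
# first string, at index 4 in the second, and is 4 characters long.
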
def longest_common_substring(string_a, string_b):
    pos_x, pos_y, length = longest_common_substring_position(string_a, string_b)
    return string_a[pos_x:pos_x + length]

def divide_string(s, p1, p2):
    """Divide s into 3 parts: [0:p1], [p1:p2], [p2:]"""
    return (s[0:p1], s[p1:p2], s[p2:])

def pairs(l):
    """For [a, b, c, … y, z] in list l, return (a, b), (b, c) … (y, z)"""
    for lower in range(len(l) - 1):
        yield (l[lower], l[lower + 1])

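# Two quick examples matching the docstrings above:
#   divide_string("abcdef", 2, 4) == ("ab", "cd", "ef")
#   list(pairs([0, 2, 5])) == [(0, 2), (2, 5)]
# pairs() is what turns the sorted morph bounds into (start, end) slices in
# Lexeme.morphs() below.
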
def extend_hypothesis(tables, hypothesis, f, t):
    change = (f, t)
    change_prob = tables.get_change_prob(f, t)
    if hypothesis is not None:
        # Extend the existing hypothesis with the change.
        return (hypothesis[0] * change_prob, hypothesis[1] + [change])
    else:
        return (change_prob, [change])

def init_hypotheses():
    return ((1.0, True, tuple()), )

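# Each hypothesis is a (probability, insertion_allowed, changes) triple:
# init_hypotheses() yields the single empty hypothesis with probability 1.0,
# and extend_hypotheses() multiplies in the probability of one more
# single-character change. The insertion_allowed flag forbids an insertion
# directly after a deletion, so each unordered deletion/insertion pair is
# only counted along one path through the table.
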
def extend_hypotheses(tables, hypotheses, f, t):
    assert hypotheses, "Hypotheses must not be empty; use init_hypotheses first."
    change = (f, t)
    change_prob = tables.get_change_prob(f, t)
    # Extend each existing hypothesis with the change.
    if t == "":
        # Deletion. Disallow insertion in the next step.
        return tuple((prob * change_prob, False, changes + (change, ))
                     for prob, _, changes in hypotheses)
    elif f == "":
        # Insertion. Check that it is allowed.
        return tuple((prob * change_prob, ins_allowed, changes + (change, ))
                     for prob, ins_allowed, changes in hypotheses if ins_allowed)
    else:
        # Substitution. Allow insertion in the next step.
        return tuple((prob * change_prob, True, changes + (change, ))
                     for prob, _, changes in hypotheses)

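# For example, extending init_hypotheses() with ("a" → "") and then with
# ("" → "b") yields an empty tuple: the deletion clears the flag, so the
# subsequent insertion is filtered out. The opposite ("" → "b"), ("a" → "")
# order is still generated and covers the same pair of edits.
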
def generate_string_mapping_hypotheses(tables, parent_stem, child_stem, hypothesis, allowed_map_types):
    if not parent_stem and not child_stem:
        assert hypothesis is not None, "Hypothesis must not be None, probably a mapping to an empty string was attempted."
        yield hypothesis
    else:
        if SUB in allowed_map_types and parent_stem and child_stem:
            #print("sub", parent_stem[0], child_stem[0])
            sub_hypothesis = extend_hypothesis(tables, hypothesis, parent_stem[0], child_stem[0])
            yield from generate_string_mapping_hypotheses(tables, parent_stem[1:], child_stem[1:], sub_hypothesis, {INS, DEL, SUB})
        if INS in allowed_map_types and child_stem:
            #print("ins", child_stem[0])
            ins_hypothesis = extend_hypothesis(tables, hypothesis, "", child_stem[0])
            yield from generate_string_mapping_hypotheses(tables, parent_stem, child_stem[1:], ins_hypothesis, allowed_map_types - {DEL})
        if DEL in allowed_map_types and parent_stem:
            #print("del", parent_stem[0])
            del_hypothesis = extend_hypothesis(tables, hypothesis, parent_stem[0], "")
            yield from generate_string_mapping_hypotheses(tables, parent_stem[1:], child_stem, del_hypothesis, allowed_map_types - {INS})

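# A minimal sketch of driving the recursive generator, assuming a
# hypothetical stub that implements the get_change_prob(f, t) interface used
# above (the real probability tables are supplied by the caller):
#
#   class _UniformTables:
#       def get_change_prob(self, f, t):
#           return 0.5  # every edit equally likely; for illustration only
#
#   for prob, changes in generate_string_mapping_hypotheses(
#           _UniformTables(), "ab", "ac", None, {INS, DEL, SUB}):
#       print(prob, changes)  # e.g. 0.25 [('a', 'a'), ('b', 'c')]
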
#def map_strings(tables, parent_stem, child_stem, new_tables, prob_modifier):
#    # I need an algorithm to walk all possibilities and add the final probabilities to new_tables.
#    total_prob = 0.0
#    for prob, hypothesis in generate_string_mapping_hypotheses(tables, parent_stem, child_stem, None, {INS, DEL, SUB}):
#        #print("Obtained hypothesis with prob {}:".format(prob), hypothesis)
#        total_prob += prob
#        for (f, t) in hypothesis:
#            new_tables.add_change(f, t, prob * prob_modifier)
#    return total_prob

#def map_strings(tables, parent_stem, child_stem, new_tables, prob_modifier):
#    """The complete version of the tabular algorithm, with the 'only keep two lines in memory at once' optimization."""
#    # I need an algorithm to walk all possibilities and add the final probabilities to new_tables.
#    # Table algorithm.
#    assert len(parent_stem) >= 1, "The parent stem must not be empty."
#    assert len(child_stem) >= 1, "The child stem must not be empty."
#    # Initialize the first row.
#    prev_line = [None] * (len(child_stem) + 1)
#    prev_line[0] = init_hypotheses()  # The initial empty hypothesis.
#    for j in range(1, len(child_stem) + 1):
#        prev_line[j] = extend_hypotheses(tables, prev_line[j - 1], "", child_stem[j - 1])
#    # Initialize the second row, i.e. the one to be filled next.
#    cur_line = [None] * (len(child_stem) + 1)
#    # Fill in the rest of the table.
#    for i in range(1, len(parent_stem) + 1):
#        # Fill in the first item (it is special, because there is only one hypothesis instead of three).
#        cur_line[0] = extend_hypotheses(tables, prev_line[0], parent_stem[i - 1], "")
#        # Fill in the rest of the line.
#        for j in range(1, len(child_stem) + 1):
#            cur_line[j] = (extend_hypotheses(tables, prev_line[j - 1], parent_stem[i - 1], child_stem[j - 1])
#                           + extend_hypotheses(tables, cur_line[j - 1], "", child_stem[j - 1])
#                           + extend_hypotheses(tables, prev_line[j], parent_stem[i - 1], ""))
#        # Swap the lines (the filled-in cur_line will be overwritten in the next iteration).
#        tmp = prev_line
#        prev_line = cur_line
#        cur_line = tmp
#    # The bottom right corner contains the hypothesis list. Calculate the final prob from it.
#    # Beware that the lines have been swapped at the end of the last cycle, so the bottom right corner is in prev_line.
#    total_prob = 0.0
#    for prob, hypothesis in prev_line[-1]:
#        #print("Obtained hypothesis with prob {}:".format(prob), hypothesis)
#        modified_prob = prob * prob_modifier
#        total_prob += prob
#        for (f, t) in hypothesis:
#            new_tables.add_change(f, t, modified_prob)
#    return total_prob

#def map_strings(tables, parent_stem, child_stem, new_tables, prob_modifier):
#    """The windowed version of the tabular algorithm, with the 'only keep two lines in memory at once' and 'only consider a small window' (borrowed from DTW) optimizations."""
#    # Lengths of the window – to the left and to the right.
#    window_size = (-2, 2)
#    assert len(parent_stem) >= 1, "The parent stem must not be empty."
#    assert len(child_stem) >= 1, "The child stem must not be empty."
#    # TODO enlarge the window dynamically if we map a very short stem to a very long one, so that we can reach the start from the first character and the end from the last one.
#    # TODO and make sure that the windows overlap between steps.
#    # Initialize the first row.
#    prev_line = [None] * (len(child_stem) + 1)
#    prev_line[0] = init_hypotheses()  # The initial empty hypothesis.
#    for j in range(1, min(len(child_stem), window_size[1]) + 1):
#        prev_line[j] = extend_hypotheses(tables, prev_line[j - 1], "", child_stem[j - 1])
#    # Fill in the rest of the table. Go through the whole parent stem.
#    for i in range(1, len(parent_stem) + 1):
#        # Initialize the second row, i.e. the one to be filled next.
#        cur_line = [None] * (len(child_stem) + 1)
#        # Calculate the center of the window.
#        cur_item = i * (len(child_stem) + 1) / (len(parent_stem) + 1)
#        # Calculate the window bounds. The upper bound is exclusive.
#        window_start = max(0, math.floor(cur_item) + window_size[0])
#        window_end = min(len(child_stem), math.ceil(cur_item) + window_size[1]) + 1
#        # Fill in the rest of the line. Map only the items from within the window.
#        for j in range(window_start, window_end):
#            cur_line[j] = tuple()
#            if j >= 1:
#                if prev_line[j - 1] is not None:
#                    cur_line[j] += extend_hypotheses(tables, prev_line[j - 1], parent_stem[i - 1], child_stem[j - 1])
#                if cur_line[j - 1] is not None:
#                    cur_line[j] += extend_hypotheses(tables, cur_line[j - 1], "", child_stem[j - 1])
#            if prev_line[j] is not None:
#                cur_line[j] += extend_hypotheses(tables, prev_line[j], parent_stem[i - 1], "")
#        # Swap the lines.
#        # It doesn't matter that cur_line stays the same, it is overwritten with a new initialization next.
#        prev_line = cur_line
#    assert prev_line[-1] is not None, "The last item never got into the window for stems '{}' and '{}'! Window sizes: cur: {}, start: {}, end: {}.".format(parent_stem, child_stem, cur_item, window_start, window_end)
#    # The bottom right corner contains the hypothesis list. Calculate the final prob from it.
#    # Beware that the lines have been swapped at the end of the last cycle, so the bottom right corner is in prev_line.
#    # That one item should definitely be in the window, so it should never be None.
#    total_prob = 0.0
#    for prob, hypothesis in prev_line[-1]:
#        #print("Obtained hypothesis with prob {}:".format(prob), hypothesis)
#        modified_prob = prob * prob_modifier
#        total_prob += prob
#        for (f, t) in hypothesis:
#            new_tables.add_change(f, t, modified_prob)
#    return total_prob

def map_strings(tables, parent_stem, child_stem, new_tables, prob_modifier):
    """The complete version of the tabular algorithm, with the 'only keep two lines in memory at once' optimization and n-best-list option culling."""
    # I need an algorithm to walk all possibilities and add the final probabilities to new_tables.
    # Table algorithm.
    # TODO put this into an argument.
    n_best_limit = 100
    assert len(parent_stem) >= 1, "The parent stem must not be empty."
    assert len(child_stem) >= 1, "The child stem must not be empty."
    # Initialize the first row.
    prev_line = [None] * (len(child_stem) + 1)
    prev_line[0] = init_hypotheses()  # The initial empty hypothesis.
    for j in range(1, len(child_stem) + 1):
        prev_line[j] = extend_hypotheses(tables, prev_line[j - 1], "", child_stem[j - 1])
    # Initialize the second row, i.e. the one to be filled next.
    cur_line = [None] * (len(child_stem) + 1)
    # Fill in the rest of the table.
    for i in range(1, len(parent_stem) + 1):
        # Fill in the first item (it is special, because there is only one hypothesis instead of three).
        cur_line[0] = extend_hypotheses(tables, prev_line[0], parent_stem[i - 1], "")
        # Fill in the rest of the line.
        for j in range(1, len(child_stem) + 1):
            hypotheses = (extend_hypotheses(tables, prev_line[j - 1], parent_stem[i - 1], child_stem[j - 1])
                          + extend_hypotheses(tables, cur_line[j - 1], "", child_stem[j - 1])
                          + extend_hypotheses(tables, prev_line[j], parent_stem[i - 1], ""))
            cur_line[j] = sorted(hypotheses, reverse=True)[:n_best_limit]
        # Swap the lines (the filled-in cur_line will be overwritten in the next iteration).
        prev_line, cur_line = cur_line, prev_line
    # The bottom right corner contains the hypothesis list. Calculate the final prob from it.
    # Beware that the lines have been swapped at the end of the last cycle, so the bottom right corner is in prev_line.
    total_prob = 0.0
    # TODO normalize probs before updating the counts.
    # We assume that there has to be a mapping from parent_stem to child_stem with a probability of prob_modifier.
    # FIXME is this correct? Shouldn't we normalize the prob_modifier somewhere first? Across all possible segmentations of a single word? Across all parent segmentation choices? Etc.
    normalizer = sum(prob for prob, ins_allowed, hypothesis in prev_line[-1])
    assert normalizer > 0.0, "Normalizer is zero for '{}' and '{}'. The result item is {}".format(parent_stem, child_stem, repr(prev_line[-1]))
    normalizer = prob_modifier / normalizer
    for prob, ins_allowed, hypothesis in prev_line[-1]:
        #print("Obtained hypothesis with prob {}:".format(prob), hypothesis)
        modified_prob = prob * normalizer
        total_prob += prob
        for (f, t) in hypothesis:
            new_tables.add_change(f, t, modified_prob)
    return total_prob

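# For a small runnable example of map_strings driven by a stub probability
# table, see TestDerinetSegmentation.test_map_strings below.
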
class Lexeme:
    __slots__ = ("id", "lemma", "parent_id", "parent_lemma", "parent", "children",
                 "morph_bounds", "morph_change_type", "stem_map")

    # Types of morpheme changes encountered in the data and their frequencies.
    # TODO detect and ignore suppletion.
    morph_change_types = {"padd": 0,  # Prefix addition
                          "prem": 0,  # Prefix removal
                          "pcha": 0,  # Prefix substitution
                          "sadd": 0,  # Suffix addition
                          "srem": 0,  # Suffix removal
                          "scha": 0,  # Suffix substitution
                          "conv": 0,  # Conversion (no change)
                          "circ": 0}  # Circumfixation (or weird change on both sides)

    allowed_morph_change_types = {"padd", "sadd", "scha", "conv",
                                  "prem", "pcha", "srem", "circ"  # Normally disallowed types, allowed for debugging.
                                  }

    def __init__(self, lemma=None, morphs=None, id=None, parent_id=None, parent_lemma=None):
        self.id = int(id) if id else None
        if lemma is not None and morphs is None:
            self.lemma = lemma
            morph_bounds = {0, len(self.lemma)}
        elif morphs is not None:
            joined_lemma = "".join(morphs)
            if lemma is None or lemma == joined_lemma:
                self.lemma = joined_lemma
                # Compute the boundary offsets from the morph lengths.
                given_bounds = []
                last_given_bound = 0
                for morph in morphs:
                    last_given_bound += len(morph)
                    given_bounds.append(last_given_bound)
                morph_bounds = {0, len(self.lemma)}.union(given_bounds)
            else:
                raise ValueError("The provided lemma '%s' doesn't match the morphs '%s'" % (lemma, joined_lemma))
        else:
            raise ValueError("You must provide at least one of {lemma, morphs}")
        self.parent_id = int(parent_id) if parent_id else None
        self.parent_lemma = parent_lemma  # TODO if both parent_id and parent_lemma are filled in, cross-check their information so that it fits.
        self.parent = None
        self.children = []
        self.morph_bounds = morph_bounds  # A set of morph boundaries in lemma. Each boundary is an int: an offset where a new morph starts in lemma.
        self.morph_change_type = None  # One of the keys of morph_change_types, documenting the type of morph change present when going from parent to self.
        self.stem_map = None  # A dict with keys "self" and "parent", each holding a (start, end) tuple of the common stem (longest common substring) in the respective lemma.

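    # For example, Lexeme(morphs=["do", "nést"]) produces lemma "donést" with
    # morph_bounds == {0, 2, 6}; see the tests at the bottom of this module.
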
    def to_string(self):
        if self.id is not None:
            return "%s (%d)" % (self.lemma, self.id)
        else:
            return "%s (from MorfFlex)" % self.lemma

    def is_cycle(self, node):
        """Detect whether the parents of node include self; return True if they do, False otherwise.
        Used in detecting cycles when adding node as the parent of self."""
        while node is not None:
            if (node.id is not None and node.id == self.id) \
               or (node.id is None and node.lemma == self.lemma):
                # A cycle has been detected.
                return True
            else:
                node = node.parent
        return False

    def set_parent(self, parent):
        """Set parent as the parent of self. Do some error checks in the process."""
        if not parent:
            raise ValueError("Setting empty parent not supported")
        if self.parent:
            raise Exception("Parent already set to %s" % self.parent)
        if self.parent_id is not None and parent.id != self.parent_id:
            raise ValueError("Parent ID of %d doesn't match expected ID of %d" % (parent.id, self.parent_id))
        if self.parent_lemma is not None and parent.lemma != self.parent_lemma:
            raise ValueError("Parent lemma '%s' doesn't match expected lemma '%s'" % (parent.lemma, self.parent_lemma))
        if self.is_cycle(parent):
            raise Exception("Found a cycle involving lexeme %s" % self.to_string())
        self.parent = parent
        parent.children.append(self)

    def count_stems_simple(self, tables):
        """Find the longest common substring of self.lemma and parent.lemma. Consider that substring
        to be the stem and the rest to be prefixes/suffixes. Store counts of the affixes of self in tables."""
        # If my own stem map can be filled in, but has not been filled in yet, do it now.
        if self.parent is not None:
            if self.stem_map is None:
                pprefix, pstem, psuffix, sprefix, sstem, ssuffix = self.find_stem_map_simple()
                tables.add_affix(pprefix, psuffix, sprefix, ssuffix, 1.0)
                #assert pstem == sstem, "The stems found by string comparison don't match: '{}' and '{}'".format(pstem, sstem)  # This assert is wrong; we ignore case, which can make the stems unequal.
                for char in pstem.lower():
                    tables.add_change(char, char, 1.0)
            else:
                logger.warning("Stem map of lexeme '%s' has already been filled in.", self.lemma)

    def estimate_probabilities(self, tables, new_tables):
        if self.parent is None:
            # We don't attempt to divide roots at all. They will only be divided in the next phase,
            # transitively, using the divisions found in their children.
            return 1.0
        hypotheses = []
        for child_stem_start in range(len(self.lemma)):
            for child_stem_end in range(len(self.lemma), child_stem_start, -1):
                child_prefix, child_stem, child_suffix = divide_string(self.lemma, child_stem_start, child_stem_end)
                for parent_stem_start in range(len(self.parent.lemma)):
                    for parent_stem_end in range(len(self.parent.lemma), parent_stem_start, -1):
                        parent_prefix, parent_stem, parent_suffix = divide_string(self.parent.lemma, parent_stem_start, parent_stem_end)
                        # TODO when the prefix is unknown, take the next smallest known suffix and multiply it by a discount based on the length difference.
                        # That may prevent probability collapse caused by the roots – attempting to find the best root will always find the smallest one,
                        # because there, the length actually plays a role.
                        # Or rather, perform some smoothing.
                        child_affix_prob = tables.get_affix_prob(parent_prefix, parent_suffix, child_prefix, child_suffix)
                        if child_affix_prob == 0.0:
                            # Don't perform the computationally expensive stem mapping if the word probability will be 0 anyway.
                            continue
                        child_stem_prob = map_strings(tables, parent_stem.lower(), child_stem.lower(), new_tables, child_affix_prob)
                        # FIXME right now, the probability is p(seen prefix) * p(stem maps to parent) * p(seen suffix). This is imbalanced and leads to spurious splits (esp. spurious prefixes).
                        # Solution: probability should be p(seen prefix) * p(seen stem) * p(seen suffix) * p(prefix does not map to parent) * p(stem maps to parent) * p(suffix does not map to parent)
                        #  = p(seen prefix) * p(seen stem) * p(seen suffix) * (1 - p(prefix maps to parent)) * p(stem maps to parent) * (1 - p(suffix maps to parent))
                        # NO! p(seen stem) should be almost constantly zero. Only words that share a parent may have the same stem.
                        # Therefore, it should be p(seen prefix) * p(seen suffix) * (1 - p(prefix maps to parent)) * p(stem maps to parent) * (1 - p(suffix maps to parent))
                        word_prob = child_stem_prob * child_affix_prob
                        if word_prob > 0.0:
                            # Record the hypothesis if it is plausible.
                            hypotheses.append((word_prob, child_affix_prob, child_stem_prob, parent_prefix, parent_suffix, child_prefix, child_stem, child_suffix, child_stem_start, child_stem_end, parent_stem_start, parent_stem_end))
        if not hypotheses:
            # No segmentation with nonzero probability was found; without this
            # guard, the normalizer below would divide by zero.
            logger.warning("No segmentation obtained for '%s'.", self.lemma)
            return 0.0
        normalizer = 1 / sum(item[0] for item in hypotheses)
        for word_prob, child_affix_prob, child_stem_prob, parent_prefix, parent_suffix, child_prefix, child_stem, child_suffix, child_stem_start, child_stem_end, parent_stem_start, parent_stem_end in hypotheses:
            new_tables.add_affix(parent_prefix, parent_suffix, child_prefix, child_suffix, word_prob * normalizer)
        # Record information about the best segmentation.
        best_prob, child_affix_prob, child_stem_prob, parent_prefix, parent_suffix, child_prefix, child_stem, child_suffix, child_stem_start, child_stem_end, parent_stem_start, parent_stem_end = max(hypotheses)
        best_prob = best_prob * normalizer
        logger.debug("Dividing '%s-%s-%s'.\tp(stem) = %e, p(affix) = %e, p(word) = %e", child_prefix, child_stem, child_suffix, child_stem_prob, child_affix_prob, best_prob)
        self.stem_map = {"self": (child_stem_start, child_stem_end),
                         "parent": (parent_stem_start, parent_stem_end)}
        self.morph_bounds = {0, self.stem_map["self"][0], self.stem_map["self"][1], len(self.lemma)}
        assert 0.0 <= best_prob <= 1.0, "Best found p(%s) = %e is out of bounds." % (self.lemma, best_prob)
        return best_prob

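    # Note: estimate_probabilities enumerates every (start, end) stem span in
    # both lemmas, i.e. O(len(child)^2 * len(parent)^2) combinations, each of
    # which may trigger a dynamic-programming map_strings call. The
    # zero-probability shortcut above is what keeps this tractable.
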
    def find_stem_map_simple(self, parent=None):
        """Look at the lemmas of self and parent and try to detect any morph changes between the two.
        If there are any, set the appropriate morph bounds.
        If parent is None, use self.parent instead.
        Return the obtained (prefix, stem, suffix) of parent.lemma, followed by those of self.lemma."""
        if parent is None:
            parent = self.parent
        if parent is None:
            logger.warning("Trying to detect morph bounds of %s, which has no parent!", self.to_string())
            return None, None, None, None, None, None
        plemma = parent.lemma
        slemma = self.lemma
        stem_start_parent, stem_start_self, stem_length = longest_common_substring_position(plemma.lower(), slemma.lower())
        # FIXME what if there is no common substring? stem_length can be zero. Fix that with a special code case.
        bounds_self = (stem_start_self, stem_start_self + stem_length)
        bounds_parent = (stem_start_parent, stem_start_parent + stem_length)
        self.stem_map = {"self": bounds_self,
                         "parent": bounds_parent}
        # Fill in the morph change type.
        stem_end_off_parent = len(plemma) - (stem_start_parent + stem_length)
        stem_end_off_self = len(slemma) - (stem_start_self + stem_length)
        morph_change_type = None
        if stem_start_parent == 0:
            # PADD / S* / CONV / CIRC
            if stem_start_self == 0:
                # S* / CONV
                if stem_end_off_parent == 0:
                    # SADD / CONV
                    if stem_end_off_self == 0:
                        morph_change_type = "conv"
                    else:
                        morph_change_type = "sadd"
                else:
                    # SREM / SCHA
                    if stem_end_off_self == 0:
                        morph_change_type = "srem"
                    else:
                        morph_change_type = "scha"
            else:
                # PADD / CIRC
                if stem_end_off_parent != 0 or stem_end_off_self != 0:
                    logger.debug("Circumfixation in %s → %s.", plemma, slemma)
                    morph_change_type = "circ"
                else:
                    morph_change_type = "padd"
        else:
            # PREM / PCHA / CIRC
            if stem_end_off_parent != 0 or stem_end_off_self != 0:
                logger.debug("Circumfixation in %s → %s.", plemma, slemma)
                morph_change_type = "circ"
            else:
                if stem_start_self == 0:
                    morph_change_type = "prem"
                else:
                    morph_change_type = "pcha"
        self.morph_change_type = morph_change_type
        Lexeme.morph_change_types[morph_change_type] += 1
        # What can happen:
        #   |xxx| → |xxx|   (conversion)
        #   |xxx| → |xxx|x  (suffixation)
        #  |xxx|x → |xxx|y  (suffix change)
        #   |xxx| → x|xxx|  (prefixation)
        #  |xxx|x → |xxx|   (suffix removal)
        # *anything else?*
        pprefix, pstem, psuffix = divide_string(plemma, stem_start_parent, stem_start_parent + stem_length)
        psegments = [x for x in (pprefix, pstem, psuffix) if x != ""]
        sprefix, sstem, ssuffix = divide_string(slemma, stem_start_self, stem_start_self + stem_length)
        ssegments = [x for x in (sprefix, sstem, ssuffix) if x != ""]
        # Write the newly found bounds, unless they are weird.
        if morph_change_type in Lexeme.allowed_morph_change_types:
            logger.debug("Divided by %s: '%s' → '%s'", morph_change_type, "/".join(psegments), "/".join(ssegments))
            self.morph_bounds = self.morph_bounds.union(bounds_self)
            parent.morph_bounds = parent.morph_bounds.union(bounds_parent)
        else:
            logger.debug("Divided by %s: '%s' → '%s' (spurious, not saving)", morph_change_type, "/".join(psegments), "/".join(ssegments))
        return pprefix, pstem, psuffix, sprefix, sstem, ssuffix

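    # For example, mapping parent "nést" to child "donést" finds the common
    # stem "nést", classifies the change as "padd" (prefix addition) and adds
    # the bounds {2, 6} to the child's morph_bounds; see
    # test_find_stem_map_simple below.
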
    def copy_morph_bounds(self, parent=None):
        """Propagate the morph bounds from parent to self. If parent is None, use self.parent instead."""
        if parent is None:
            parent = self.parent
        if parent is not None:
            stem_bounds_parent = self.stem_map['parent']
            stem_bounds_self = self.stem_map['self']
            offset = stem_bounds_self[0] - stem_bounds_parent[0]
            # Map the parent's inner-stem splits into self's coordinates.
            mapped_self_splits = [split + offset for split in parent.morph_bounds
                                  if stem_bounds_parent[0] < split < stem_bounds_parent[1]]
            self.morph_bounds = self.morph_bounds.union(mapped_self_splits)

    def propagate_morph_bounds(self):
        """Find whether splits of this lemma appear in its children and/or parent as well, and propagate them if necessary.
        Call after detecting the stem bounds first.
        Best called on the root of a tree."""
        # First, go down the tree, towards its leaves.
        # After that's finished, go from the leaves up.
        # FIXME if one child gives me a new split, do I propagate it to the other children?
        self.copy_morph_bounds()
        # Process each child twice, so that splits discovered in a later child
        # get a chance to propagate through self into the earlier children as
        # well (cf. the FIXME above).
        for child in self.children + self.children:
            child.propagate_morph_bounds()
            # I have the child's stem_map, which documents how the morphs in the child map to morphs in self.
            # All splits in the child's stem map to splits in self's stem.
            # Other splits don't map.
            stem_bounds_self = child.stem_map['parent']
            stem_bounds_child = child.stem_map['self']
            offset = stem_bounds_self[0] - stem_bounds_child[0]
            mapped_child_splits = [split + offset for split in child.morph_bounds
                                   if stem_bounds_child[0] < split < stem_bounds_child[1]]
            self.morph_bounds = self.morph_bounds.union(mapped_child_splits)
        # TODO do it again.

    def morphs(self):
        """Return a list of morphs, in order. Joined together, they form the lemma."""
        return [self.lemma[lower:upper] for lower, upper in pairs(sorted(self.morph_bounds))]

class TestDerinetSegmentation(unittest.TestCase):
    def test_lcs(self):
        self.assertEqual(longest_common_substring("ahoj", "ahoj"), "ahoj")
        self.assertEqual(longest_common_substring("ahoj", "bhoj"), "hoj")
        self.assertEqual(longest_common_substring("ahoj", "bhojahoj"), "ahoj")
        self.assertEqual(longest_common_substring("hoja", "hojb"), "hoj")
        self.assertEqual(longest_common_substring("ahoja", "bhojb"), "hoj")
        self.assertEqual(longest_common_substring("amatér", "amater"), "amat")
        self.assertEqual(longest_common_substring("", ""), "")
        self.assertEqual(longest_common_substring("", "ahoj"), "")
        self.assertEqual(longest_common_substring("ahoj", ""), "")
        self.assertEqual(longest_common_substring("ahojxahoj", "ahojyahoj"), "ahoj")
        self.assertEqual(longest_common_substring("xyaz", "abc"), "a")
        self.assertEqual(longest_common_substring("xyz", "abc"), "")
        self.assertEqual(longest_common_substring("ahoj", "hojahoj"), "ahoj")
        self.assertEqual(longest_common_substring("hojahoj", "ahoj"), "ahoj")
        self.assertIn(longest_common_substring("abcxyz", "xyzabc"), ("abc", "xyz"))
        self.assertIn(longest_common_substring("xyzabc", "abcxyz"), ("abc", "xyz"))

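    # The following tests were added as illustrative examples. The
    # map_strings test uses a hypothetical uniform-probability stub in place
    # of the external probability-table object, which is not part of this
    # module.
    def test_divide_string(self):
        self.assertEqual(divide_string("abcdef", 2, 4), ("ab", "cd", "ef"))

    def test_pairs(self):
        self.assertEqual(list(pairs([0, 2, 5])), [(0, 2), (2, 5)])
        self.assertEqual(list(pairs([7])), [])

    def test_lexeme_morphs(self):
        lexeme = Lexeme(morphs=["do", "nést"])
        self.assertEqual(lexeme.lemma, "donést")
        self.assertEqual(lexeme.morph_bounds, {0, 2, 6})
        self.assertEqual(lexeme.morphs(), ["do", "nést"])

    def test_find_stem_map_simple(self):
        parent = Lexeme(lemma="nést")
        child = Lexeme(lemma="donést")
        child.set_parent(parent)
        self.assertEqual(child.find_stem_map_simple(),
                         ("", "nést", "", "do", "nést", ""))
        self.assertEqual(child.morph_change_type, "padd")
        self.assertEqual(child.morphs(), ["do", "nést"])

    def test_map_strings(self):
        class UniformTables:
            """A stand-in for the external table object: every single-character
            change has probability 0.01, and added changes are accumulated."""
            def __init__(self):
                self.changes = defaultdict(float)

            def get_change_prob(self, f, t):
                return 0.01

            def add_change(self, f, t, prob):
                self.changes[(f, t)] += prob

        old_tables, new_tables = UniformTables(), UniformTables()
        # Mapping "a" to "b" allows a substitution (p = 0.01) and an insertion
        # followed by a deletion (p = 0.0001); the reverse deletion-insertion
        # path is pruned by the insertion_allowed flag. map_strings returns
        # the sum of the unnormalized hypothesis probabilities.
        total_prob = map_strings(old_tables, "a", "b", new_tables, 1.0)
        self.assertAlmostEqual(total_prob, 0.0101)
        # The counts written into new_tables are normalized by the total.
        self.assertAlmostEqual(new_tables.changes[("a", "b")], 0.01 / 0.0101)
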
if __name__ == '__main__':
    #unittest.main()
    pass