-
Notifications
You must be signed in to change notification settings - Fork 40
/
Copy pathTruecaser.py
104 lines (81 loc) · 4.28 KB
/
Truecaser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import string
import math
"""
This file contains the functions to truecase a sentence.
"""
def _smoothedCasingShare(dist, makeKey, possibleToken, alternativeTokens, pseudoCount):
    """
    Returns the add-pseudoCount smoothed share of possibleToken's count among
    the counts of all alternativeTokens, where makeKey builds the key used to
    look a token up in dist.
    """
    nominator = dist[makeKey(possibleToken)] + pseudoCount
    denominator = sum(dist[makeKey(alternativeToken)] + pseudoCount
                      for alternativeToken in alternativeTokens)
    return nominator / denominator


def getScore(prevToken, possibleToken, nextToken, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist, pseudoCount=5.0):
    """
    Returns the log score of one casing candidate in its sentence context.

    The score is the sum of the logs of four smoothed ratios (unigram,
    backward bigram, forward bigram, trigram). Each ratio is the count of the
    candidate divided by the summed counts of all casings listed in
    wordCasingLookup[possibleToken.lower()], with pseudoCount added to every
    count. A ratio whose context is missing contributes log(1) = 0.

    @param prevToken: Already truecased previous token, or None at the sentence start
    @param possibleToken: The casing candidate to score
    @param nextToken: The following token, or None at the sentence end. It is
        lower cased before it is used in a key
    @param wordCasingLookup: Mapping from a lower cased token to its known casings
    @param uniDist: Counts indexed by token
    @param backwardBiDist: Counts indexed by prevToken+'_'+token
    @param forwardBiDist: Counts indexed by token+'_'+nextToken (lower cased)
    @param trigramDist: Counts indexed by prevToken+'_'+token+'_'+nextToken (lower cased)
    @param pseudoCount: Smoothing constant added to every count. Must be
        greater than 0, otherwise unseen n-grams lead to log(0) or a division
        by zero
    @return: The log score; higher means more likely

    The distributions are indexed directly, so they have to yield a count for
    every key that is looked up (a mapping returning 0 for unseen keys, such
    as collections.Counter, satisfies this).
    """
    # Force float so that the ratios never use integer division on Python 2.
    pseudoCount = float(pseudoCount)

    # The set of competing casings is the same for all four ratios.
    alternativeTokens = wordCasingLookup[possibleToken.lower()]

    hasPrev = prevToken is not None
    hasNext = nextToken is not None
    if hasNext:
        nextToken = nextToken.lower()  # Ensure it is lower case

    # Get Unigram Score
    unigramScore = _smoothedCasingShare(
        uniDist, lambda tok: tok,
        possibleToken, alternativeTokens, pseudoCount)

    # Get Backward Score
    bigramBackwardScore = 1
    if hasPrev:
        bigramBackwardScore = _smoothedCasingShare(
            backwardBiDist, lambda tok: prevToken + '_' + tok,
            possibleToken, alternativeTokens, pseudoCount)

    # Get Forward Score
    bigramForwardScore = 1
    if hasNext:
        bigramForwardScore = _smoothedCasingShare(
            forwardBiDist, lambda tok: tok + "_" + nextToken,
            possibleToken, alternativeTokens, pseudoCount)

    # Get Trigram Score
    trigramScore = 1
    if hasPrev and hasNext:
        trigramScore = _smoothedCasingShare(
            trigramDist, lambda tok: prevToken + "_" + tok + "_" + nextToken,
            possibleToken, alternativeTokens, pseudoCount)

    return (math.log(unigramScore) + math.log(bigramBackwardScore)
            + math.log(bigramForwardScore) + math.log(trigramScore))
def getTrueCase(tokens, outOfVocabularyTokenOption, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    """
    Returns the true case for the passed tokens.

    @param tokens: Tokens in a single sentence. Each token is looked up in
        wordCasingLookup exactly as passed
    @param outOfVocabularyTokenOption:
        title: Returns out of vocabulary (OOV) tokens in 'title' format
        lower: Returns OOV tokens in lower case
        as-is: Returns OOV tokens as is (also used for any other value)
    @param wordCasingLookup: Mapping from a token to the collection of its known casings
    @param uniDist: Passed unchanged to getScore
    @param backwardBiDist: Passed unchanged to getScore
    @param forwardBiDist: Passed unchanged to getScore
    @param trigramDist: Passed unchanged to getScore
    @return: List with one truecased entry per input token
    """
    tokensTrueCase = []
    lastIdx = len(tokens) - 1

    # enumerate instead of xrange: works on Python 2 and on Python 3.
    for tokenIdx, token in enumerate(tokens):
        # Punctuation and digit-only tokens are copied through unchanged.
        if token in string.punctuation or token.isdigit():
            tokensTrueCase.append(token)
            continue

        if token not in wordCasingLookup:
            # Token out of vocabulary
            if outOfVocabularyTokenOption == 'title':
                tokensTrueCase.append(token.title())
            elif outOfVocabularyTokenOption == 'lower':
                tokensTrueCase.append(token.lower())
            else:
                tokensTrueCase.append(token)
            continue

        possibleTokens = wordCasingLookup[token]
        if len(possibleTokens) == 1:
            # Only one known casing, no scoring required.
            bestToken = next(iter(possibleTokens))
        else:
            # The previous token is taken from the output (already truecased),
            # the next token from the input (not yet truecased).
            prevToken = tokensTrueCase[tokenIdx - 1] if tokenIdx > 0 else None
            nextToken = tokens[tokenIdx + 1] if tokenIdx < lastIdx else None

            bestToken = None
            highestScore = float("-inf")
            for possibleToken in possibleTokens:
                score = getScore(prevToken, possibleToken, nextToken, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
                if score > highestScore:
                    bestToken = possibleToken
                    highestScore = score

        if tokenIdx == 0:
            # Sentence start: upper case the first character only. str.title()
            # is not used because it lower cases the remaining characters and
            # would destroy the selected casing ("NASA" -> "Nasa").
            bestToken = bestToken[:1].upper() + bestToken[1:]

        tokensTrueCase.append(bestToken)

    return tokensTrueCase