-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathtagger.py
94 lines (84 loc) · 3.56 KB
/
tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from viterbi import viterbi
from bigram import Bigram
from liblinearutil import load_model, predict
from tools import sentenceIterator, featurizeSentence, addTagging
import math
import sys
import os
class Tagger(object):
    """Tag sentences with a liblinear observation model and a bigram
    transition model, decoded with ``viterbi``.

    Attributes set by ``__init__``:
        featureSet: passed to ``featurizeSentence`` in ``tag_corp``.
        params: option string handed to liblinear's ``predict``.
        lmw: third argument of ``viterbi`` (presumably the language-model
            weight -- confirm against ``viterbi``).
        transProbs: result of ``Bigram.getModelFromFile``.
        model: liblinear model loaded from ``<modelName>.model``.
        labelCounter: object whose ``noToFeat`` maps label numbers to tags.
        featCounter: object whose ``getNo`` maps a feature to its number.
    """

    def __init__(self, featureSet, options):
        """Load both models and keep the feature/label number mappings.

        ``options`` must provide the keys ``'lmw'``, ``'modelName'``,
        ``'bigramModelFile'``, ``'labelCounter'`` and ``'featCounter'``.
        Progress messages are written to stderr.
        """
        self.featureSet = featureSet
        # '-b 1' asks liblinear's predict() for probability estimates.
        self.params = '-b 1'
        self.lmw = options['lmw']
        modelName = options['modelName']
        sys.stderr.write('loading transition model...')
        self.transProbs = Bigram.getModelFromFile(options['bigramModelFile'])
        sys.stderr.write('done\nloading observation model...')
        self.model = load_model('{0}.model'.format(modelName))
        self.labelCounter = options['labelCounter']
        self.featCounter = options['featCounter']
        sys.stderr.write('done\n')

    def getNumberedSenFeats(self, senFeats):
        """Return ``senFeats`` with every feature replaced by
        ``self.featCounter.getNo(feature)``; the nesting (one list per
        token) is preserved.
        """
        get_no = self.featCounter.getNo
        return [[get_no(feat) for feat in feats] for feats in senFeats]

    def getLogTagProbsByPos(self, senFeats):
        """Return one ``{tag: log probability}`` dict per token.

        Each token's features are numbered, turned into a sparse
        ``{feature number: 1}`` context and scored with liblinear's
        ``predict``. ``math.log`` raises ``ValueError`` if the model
        returns a probability of exactly 0.
        """
        numbered = self.getNumberedSenFeats(senFeats)
        contexts = [{feat: 1 for feat in feats} for feats in numbered]
        # predict() requires a label per instance; the values are only used
        # for its accuracy report, which is discarded here.
        dummy_outcomes = [1] * len(contexts)
        _, _, prob_dists = predict(dummy_outcomes, contexts,
                                   self.model, self.params)
        # Position i of a distribution is looked up as label number i + 1.
        # NOTE(review): this assumes liblinear orders the probability
        # columns by ascending label number -- confirm against the model's
        # label order.
        no_to_tag = self.labelCounter.noToFeat
        return [{no_to_tag[idx + 1]: math.log(prob)
                 for idx, prob in enumerate(dist)}
                for dist in prob_dists]

    def tag_features(self, file_name):
        """Yield the tagging of every sentence in a pre-featurized file.

        The file holds one token per line (whitespace-separated features)
        and a bare newline after each sentence. Every yielded item is a
        list with one single-element ``[tag]`` list per token. A final
        sentence that is not followed by a blank line is tagged as well;
        empty sentences (consecutive blank lines) are skipped. Progress is
        reported on stderr every 1000 sentences.
        """
        sen_feats = []
        sen_count = 0
        with open(file_name) as feature_file:
            for line in feature_file:
                if line != '\n':
                    sen_feats.append(line.strip().split())
                    continue
                # Separator line: it must not be added to the next sentence
                # as a token without features.
                if not sen_feats:
                    continue
                sen_count += 1
                yield [[tag] for tag in self.tag_sen_feats(sen_feats)]
                sen_feats = []
                if sen_count % 1000 == 0:
                    sys.stderr.write(str(sen_count) + '...')
        if sen_feats:
            # The file ended without a closing blank line.
            sen_count += 1
            yield [[tag] for tag in self.tag_sen_feats(sen_feats)]
        sys.stderr.write(str(sen_count) + '...done\n')

    def tag_dir(self, dir_name):
        """Tag every file in ``dir_name`` and yield ``(sentence, file name)``.

        Processing is best-effort: an error in one file is reported on
        stderr and the remaining files are still processed.
        """
        for fn in os.listdir(dir_name):
            sys.stderr.write('processing file {}...'.format(fn))
            try:
                with open(os.path.join(dir_name, fn)) as corpus:
                    for sen, _ in self.tag_corp(corpus):
                        yield sen, fn
            except Exception:
                # Deliberately not a bare ``except``: GeneratorExit and
                # KeyboardInterrupt have to propagate, otherwise closing
                # this generator would make it continue with the next file.
                sys.stderr.write('error in file {}\n'.format(fn))

    def tag_corp(self, input):
        """Yield ``(tagged sentence, comment)`` for each item produced by
        ``sentenceIterator(input)``.

        Progress is reported on stderr every 1000 sentences.
        """
        sen_count = 0
        for sen, comment in sentenceIterator(input):
            sen_count += 1
            sen_feats = featurizeSentence(sen, self.featureSet)
            best_tagging = self.tag_sen_feats(sen_feats)
            tagged_sen = addTagging(sen, best_tagging)
            yield tagged_sen, comment
            if sen_count % 1000 == 0:
                sys.stderr.write(str(sen_count) + '...')
        sys.stderr.write(str(sen_count) + '...done\n')

    def tag_sen_feats(self, sen_feats):
        """Return the second element of ``viterbi``'s result (the best
        tagging) for one sentence given as per-token feature lists.
        """
        log_tag_probs = self.getLogTagProbsByPos(sen_feats)
        _, best_tagging = viterbi(self.transProbs, log_tag_probs, self.lmw)
        return best_tagging