-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathaddlanguage.py
130 lines (104 loc) · 3.82 KB
/
addlanguage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
"""Add language labels to output of nearest.py."""
from __future__ import print_function
import sys
import re
import codecs
import logging
from os import path
from logging import warn, info
# Lower the root logger threshold to INFO so the info() progress messages
# issued by this script are not filtered out.
logging.getLogger().setLevel(logging.INFO)
# Regex for lines to add language codes to: optional leading whitespace, one
# non-whitespace token (group 1, the word), whitespace, then a run of digits
# and dots (group 2, the similarity value) and optional trailing whitespace.
# Signs and exponents are not accepted in group 2, so a value such as
# "-0.5" or "1e-05" does not match and the line gets no label appended.
WORD_SIM_RE = re.compile(r'^\s*(\S+)\s+([0-9.]+)\s*$')
class FormatError(Exception):
    """Signal a vocabulary file line that cannot be parsed."""
def argparser():
    """Build the command-line argument parser for this script.

    Returns:
        An ArgumentParser that accepts -f/--file, -r/--max-rank (int),
        -v/--verbose (flag) and one or more positional vocabulary files
        stored under the name ``vocabs``.
    """
    try:
        import argparse
    except ImportError:
        # Project-local fallback used when the plain import fails
        # (presumably a bundled backport for old interpreters -- confirm).
        import compat.argparse as argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--file',
        default=None,
        help='file to add languages to (default STDIN)')
    parser.add_argument(
        '-r', '--max-rank',
        metavar='INT',
        default=None,
        type=int,
        help='only read r most frequent words')
    parser.add_argument(
        '-v', '--verbose',
        default=False,
        action='store_true')
    parser.add_argument(
        'vocabs',
        nargs='+',
        metavar='FILE',
        help='vocabulary files')
    return parser
def language_label(fn):
    """Derive a language label from a vocabulary file path.

    The label is the base name of ``fn`` with its last extension dropped
    and every occurrence of '_vocab' removed, so 'data/en_vocab.txt'
    yields 'en'.
    """
    stem, _extension = path.splitext(path.basename(fn))
    return stem.replace('_vocab', '')
def load_vocab(fn, options):
    """Read one vocabulary file into a word -> frequency dict.

    Every line must consist of an integer frequency, whitespace, and the
    word (the remainder of the line after trailing whitespace is removed).

    Args:
        fn: path of the UTF-8 encoded vocabulary file.
        options: object whose ``max_rank`` attribute, when truthy, caps
            the number of lines read from the start of the file.

    Returns:
        Dict mapping each word to its frequency. When a word occurs more
        than once, a warning is logged and the first frequency is kept.

    Raises:
        FormatError: a line does not split into exactly two fields, or
            its first field is not an integer.
    """
    vocab = {}
    with codecs.open(fn, encoding='utf-8') as vocab_file:
        for lineno, line in enumerate(vocab_file, start=1):
            if options.max_rank and lineno > options.max_rank:
                break
            line = line.rstrip()
            parts = line.split(None, 1)
            if len(parts) != 2:
                raise FormatError('line {} in {}: {}'.format(lineno, fn, line))
            count, word = parts
            try:
                freq = int(count)
            except ValueError:
                raise FormatError('line {} in {}: {}'.format(lineno, fn, line))
            if word in vocab:
                # Keep the first occurrence; only report the repeat.
                warn('duplicate word in {}: {}'.format(fn, word))
                continue
            vocab[word] = freq
    return vocab
def load_vocabs(files, options):
    """Load several vocabulary files and index them by word.

    Args:
        files: iterable of vocabulary file paths; the language label of
            each is taken from its file name via language_label().
        options: passed through unchanged to load_vocab().

    Returns:
        Dict of dicts: the outer key is the word, the inner key is the
        language label, and the value is the word's frequency in that
        language.

    Raises:
        ValueError: two files resolve to the same language label.
    """
    per_language = {}
    for fn in files:
        label = language_label(fn)
        if label in per_language:
            raise ValueError('duplicate language {}'.format(label))
        vocab = load_vocab(fn, options)
        info('read {} words for {} from {}'.format(len(vocab), label, fn))
        per_language[label] = vocab

    # Invert label -> {word: freq} into word -> {label: freq}.
    by_word = {}
    for label, vocab in per_language.items():
        for word, freq in vocab.items():
            by_word.setdefault(word, {})[label] = freq
    return by_word
def format_labels(word, vocabs, options):
    """Render the language label(s) for ``word``.

    Args:
        word: the word to look up.
        vocabs: mapping word -> {language label: frequency}.
        options: object with a boolean ``verbose`` attribute.

    Returns:
        '<NONE>' when the word is in no vocabulary. Otherwise, with
        ``verbose`` off, the label with the highest frequency (ties go to
        the label that sorts last); with ``verbose`` on, all labels as
        tab-separated 'label (freq)' items in descending order.
    """
    if word not in vocabs:
        return '<NONE>'

    # Sort on (frequency, label) pairs, then flip to get highest first.
    ranked = [(freq, label) for label, freq in vocabs[word].items()]
    ranked.sort()
    ranked.reverse()

    if options.verbose:
        return '\t'.join('{} ({})'.format(label, freq) for freq, label in ranked)
    return ranked[0][1]  # Most frequent label only
def add_languages(flo, vocabs, options, out=None):
    """Copy lines from ``flo`` to ``out``, appending language labels.

    Each input line has trailing whitespace removed. Lines matching
    WORD_SIM_RE (a word followed by a similarity value) get a tab and the
    result of format_labels() for the word appended; all other lines are
    written without a label.

    Args:
        flo: iterable of text lines.
        vocabs: mapping word -> {language label: frequency}.
        options: passed through to format_labels().
        out: file-like object to print to; defaults to standard output
            encoded as UTF-8.
    """
    if out is None:
        # On Python 3 sys.stdout is a text stream and rejects the bytes a
        # codecs writer emits, so wrap its underlying binary buffer when
        # there is one. Python 2's sys.stdout has no ``buffer`` attribute
        # and accepts bytes directly.
        raw_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
        out = codecs.getwriter('utf-8')(raw_stdout)
    for line in flo:
        line = line.rstrip()
        m = WORD_SIM_RE.match(line)
        if m:
            word = m.group(1)
            line += '\t{}'.format(format_labels(word, vocabs, options))
        print(line, file=out)
def main(argv):
    """Run the script: load vocabularies and label the input lines.

    Args:
        argv: full argument vector including the program name at index 0.

    Input is read as UTF-8 from the file given with -f/--file, or from
    standard input when no file is given; output goes to standard output
    via add_languages().
    """
    options = argparser().parse_args(argv[1:])
    vocabs = load_vocabs(options.vocabs, options)
    if options.file is None:
        # On Python 3 sys.stdin already yields decoded text, which a codecs
        # reader cannot decode again; read from the underlying binary
        # buffer when there is one. Python 2's sys.stdin has no ``buffer``
        # attribute and yields bytes directly.
        raw_stdin = getattr(sys.stdin, 'buffer', sys.stdin)
        with codecs.getreader('utf-8')(raw_stdin) as f:
            add_languages(f, vocabs, options)
    else:
        with codecs.open(options.file, encoding='utf-8') as f:
            add_languages(f, vocabs, options)


if __name__ == '__main__':
    sys.exit(main(sys.argv))