-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngram.py
122 lines (104 loc) · 3.29 KB
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
__file__
ngram.py
__description__
This file provides functions to compute n-grams and n-terms.
__author__
Liang yongjie
"""
def getUnigram(words):
    """Return the unigrams of ``words``, which is the word list itself.

    Args:
        words: a list of words, e.g., ['I', 'am', 'Denny'].

    Returns:
        The very same list object that was passed in (no copy is made), so
        mutating the result mutates the caller's list.

    Raises:
        AssertionError: if ``words`` is not a list.
    """
    # isinstance (instead of an exact type comparison) also accepts
    # subclasses of list, which behave identically here.
    assert isinstance(words, list)
    return words
def getBigram(words, join_string='_', skip=0):
    """Build the list of (skip-)bigrams of ``words``.

    Example (join_string='_', skip=0):
        ['I', 'am', 'Denny'] -> ['I_am', 'am_Denny']

    Args:
        words: a list of words.
        join_string: separator placed between the two words of a bigram.
            Defaults to '_' like the other n-gram helpers in this file.
        skip: maximum number of words that may be skipped between the two
            members of a bigram; 0 yields only adjacent pairs.

    Returns:
        A list of joined bigram strings. When ``words`` has fewer than two
        elements, the unigram result (``words`` itself) is returned instead.

    Raises:
        AssertionError: if ``words`` is not a list.
    """
    assert isinstance(words, list)
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L - 1):
            # k is the distance from the first word to the second one.
            for k in range(1, skip + 2):
                if i + k < L:
                    lst.append(join_string.join([words[i], words[i + k]]))
    else:
        # too short for a bigram: fall back to unigrams
        lst = getUnigram(words)
    return lst
def getTrigram(words, join_string='_', skip=0):
    """Build the list of (skip-)trigrams of ``words``.

    Example (join_string='_', skip=0):
        ['I', 'am', 'Denny'] -> ['I_am_Denny']

    Args:
        words: a list of words.
        join_string: separator placed between the words of a trigram.
        skip: maximum number of words that may be skipped between two
            consecutive members of a trigram; 0 yields only contiguous
            triples.

    Returns:
        A list of joined trigram strings. When ``words`` has fewer than
        three elements, the result of ``getBigram`` is returned instead.

    Raises:
        AssertionError: if ``words`` is not a list.
    """
    assert isinstance(words, list)
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L - 2):
            # k1 / k2 are the gaps first->second and second->third word.
            for k1 in range(1, skip + 2):
                for k2 in range(1, skip + 2):
                    # k2 >= 1, so this single check also guarantees
                    # that i + k1 is a valid index.
                    if i + k1 + k2 < L:
                        lst.append(join_string.join(
                            [words[i], words[i + k1], words[i + k1 + k2]]))
    else:
        # too short for a trigram: fall back to bigrams
        lst = getBigram(words, join_string, skip)
    return lst
def getFourgram(words, join_string='_'):
    """Build the list of contiguous fourgrams of ``words``.

    Example (join_string='_'):
        ['I', 'am', 'Denny', 'boy'] -> ['I_am_Denny_boy']

    Args:
        words: a list of words.
        join_string: separator placed between the words of a fourgram.

    Returns:
        A list of joined fourgram strings. When ``words`` has fewer than
        four elements, the result of ``getTrigram`` is returned instead.

    Raises:
        AssertionError: if ``words`` is not a list.
    """
    assert isinstance(words, list)
    L = len(words)
    if L > 3:
        # range (not the Python-2-only xrange) keeps this consistent with
        # the other helpers and working on Python 3.
        lst = [join_string.join(words[i:i + 4]) for i in range(L - 3)]
    else:
        # too short for a fourgram: fall back to trigrams
        lst = getTrigram(words, join_string)
    return lst
def getBiterm(words, join_string="_"):
    """Build the list of biterms (all ordered-by-position word pairs).

    Example (join_string='_'):
        ['I', 'am', 'Denny', 'boy'] ->
        ['I_am', 'I_Denny', 'I_boy', 'am_Denny', 'am_boy', 'Denny_boy']

    Args:
        words: a list of words.
        join_string: separator placed between the two words of a biterm.

    Returns:
        A list with one joined string per pair of positions i < j, in
        lexicographic order of (i, j). When ``words`` has fewer than two
        elements, the unigram result (``words`` itself) is returned instead.

    Raises:
        AssertionError: if ``words`` is not a list.
    """
    # Local import: the module has no import section of its own.
    from itertools import combinations

    assert isinstance(words, list)
    if len(words) > 1:
        # combinations() yields position pairs i < j in the same order as
        # the equivalent nested index loops.
        lst = [join_string.join(pair) for pair in combinations(words, 2)]
    else:
        # too short for a biterm: fall back to unigrams
        lst = getUnigram(words)
    return lst
def getTriterm(words, join_string="_"):
    """Build the list of triterms (all ordered-by-position word triples).

    Example (join_string='_'):
        ['I', 'am', 'Denny'] -> ['I_am_Denny']
        ['a', 'b', 'c', 'd'] -> ['a_b_c', 'a_b_d', 'a_c_d', 'b_c_d']

    Only triples that keep the original word order (positions i < j < k)
    are produced; permutations of the same three words are not generated.

    Args:
        words: a list of words.
        join_string: separator placed between the words of a triterm.

    Returns:
        A list with one joined string per triple of positions i < j < k,
        in lexicographic order of (i, j, k). When ``words`` has fewer than
        three elements, the result of ``getBiterm`` is returned instead.

    Raises:
        AssertionError: if ``words`` is not a list.
    """
    # Local import: the module has no import section of its own.
    from itertools import combinations

    assert isinstance(words, list)
    if len(words) > 2:
        # combinations() replaces the former xrange-based triple loop
        # (xrange does not exist on Python 3) and yields the same order.
        lst = [join_string.join(triple) for triple in combinations(words, 3)]
    else:
        # too short for a triterm: fall back to biterms
        lst = getBiterm(words, join_string)
    return lst