#!/usr/bin/env python3
# -*- coding: utf-8 -*-
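# Test/example script for the colibricore Python bindings: exercises pattern
# construction and manipulation (n-grams, skipgrams, flexgrams), class
# encoding/decoding, corpus indexing, and unindexed/indexed pattern models.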
from __future__ import print_function, division, absolute_import
import os
import sys
from copy import copy
import pickle
def red(s):
    CSI = "\x1B["
    return CSI + "1;31m" + s + CSI + "0m"


def green(s):
    CSI = "\x1B["
    return CSI + "1;32m" + s + CSI + "0m"
def test(a, b=None):
    if b is None:
        if a:
            return "... " + green("ok")
        else:
            print(red("FAILED!"), file=sys.stderr)
            sys.exit(2)
    else:
        if a == b:
            return "... " + str(a) + " " + green("ok")
        else:
            print(red("FAILED!") + " Got " + str(a) + ", expected " + str(b), file=sys.stderr)
            sys.exit(2)


try:
    import colibricore
except ImportError:
    print("Run setup.py install first!", file=sys.stderr)
    raise
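
# Write a tiny class file by hand: one "<classid>\t<word>" entry per line,
# which the ClassDecoder/ClassEncoder below use to map between words and
# class ids.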
with open("/tmp/colibritest",'w') as f:
f.write("5\tbe\n6\tTo\n7\tto\n8\tor\n9\tnot\n73477272\tblah\n")
print("Loading class decoder...")
decoder = colibricore.ClassDecoder("/tmp/colibritest")
print("Loading class encoder...")
encoder = colibricore.ClassEncoder("/tmp/colibritest")
print("Building pattern...")
ngram = encoder.buildpattern("To be or not to be")
print("Ngram: ", test(ngram.tostring(decoder),"To be or not to be"))
print("Size: ", test(len(ngram),6))
print("Bytesize: ", test(ngram.bytesize(),6))
print("Category==NGRAM", test(ngram.category() == colibricore.Category.NGRAM) )
print("Hash: ", test(hash(ngram)))
print("Raw bytes: ",repr(bytes(ngram)))
print("Third token ", test(ngram[2].tostring(decoder), "or"))
print("Last token ", test(ngram[-1].tostring(decoder), "be"))
print("Slicing ngram[2:4]", test(ngram[2:4].tostring(decoder), "or not"))
print("Copying n-gram:", test(copy(ngram) == ngram))
if sys.version[0] != '2':
    # Python 3 only for now:
    print("Pickling n-gram:")
    pickled = pickle.dumps(ngram)
    print("Unpickling n-gram:")
    unpickledngram = pickle.loads(pickled)
    print("Equality check", test(ngram == unpickledngram))
print("Tokens of ngram:")
tokens =["To","be","or","not","to","be"]
for token,tokenref in zip(ngram,tokens):
test(token.tostring(decoder),tokenref)
print("Count check", test(len(list(iter(ngram))), len(tokens)))
subngrams = [
    "To",
    "be",
    "or",
    "not",
    "to",
    "be",
    "To be",
    "be or",
    "or not",
    "not to",
    "to be",
    "To be or",
    "be or not",
    "or not to",
    "not to be",
    "To be or not",
    "be or not to",
    "or not to be",
    "To be or not to",
    "be or not to be"]
print("Subgrams of ngram:")
for subngram, subngramref in zip(ngram.subngrams(), subngrams):
    print(test(subngram.tostring(decoder), subngramref))
print("Count check", test(len(list(ngram.subngrams())), len(subngrams)))
subngram = encoder.buildpattern("or not")
print("Testing occurrence of substring 'or not'...", test(subngram in ngram))
subngram2 = encoder.buildpattern("to be")
print("Testing occurrence of substring 'to be'...", test(subngram2 in ngram))
subngram3 = encoder.buildpattern("or")
print("Testing occurrence of substring 'or'...", test(subngram3 in ngram))
print("Testing gram addition:")
ngramconc = subngram + subngram2
print(ngramconc.tostring(decoder),test(ngramconc.tostring(decoder) == "or not to be"))
print("Testing sorting")
for subngram in sorted(ngram.subngrams()):
    print(subngram.tostring(decoder))
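
# Skipgram tests: {*1*} in the input below marks a gap of exactly one token.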
print("Skipgram test")
skipgram = encoder.buildpattern("To {*1*} or {*1*} to be")
print("Skipgram: ", test(skipgram.tostring(decoder),"To {*} or {*} to be") )
print("Size: ", test(len(skipgram),6))
print("Bytesize: ", test(skipgram.bytesize(),6))
print("Category==SKIPGRAM", test(skipgram.category() == colibricore.Category.SKIPGRAM) )
print("Hash: ", test(hash(skipgram)))
print("Skipcount check...", test(skipgram.skipcount() == 2))
print("Parts:")
for part in skipgram.parts():
    print(part.tostring(decoder))
print("Gaps:")
for begin, length in skipgram.gaps():
    print(begin, length)
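
# Flexgram tests: toflexgram() turns the skipgram's fixed-width gaps into
# variable-width gaps, rendered as {**}.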
print("Converting to flexgram")
flexgram = skipgram.toflexgram()
print("Flexgram: ", test(flexgram.tostring(decoder),"To {**} or {**} to be" ))
print("Size: ", test(len(flexgram),6))
print("Bytesize: ", test(flexgram.bytesize(),6))
print("Category==SKIPGRAM", test(flexgram.category() == colibricore.Category.FLEXGRAM) )
print("Hash: ", test(hash(flexgram)))
print("Skipcount check...", test(flexgram.skipcount() == 2))
print("Parts:")
partsref = ["To","or","to be"]
for part, partref in zip(flexgram.parts(), partsref):
    print(test(part.tostring(decoder), partref))
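
# Corpus test data: a tokenised Hamlet excerpt plus a few artificial lines,
# used below to build and test pattern models.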
hamlet = """To be or not to be , that is the question ;
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune ,
Or to take arms against a sea of troubles ,
And by opposing , end them . To die , to sleep ;
No more ; and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to — 'tis a consummation
Devoutly to be wish'd . To die , to sleep ;
To sleep , perchance to dream . Ay , there's the rub ,
For in that sleep of death what dreams may come ,
When we have shuffled off this mortal coil ,
Must give us pause . There's the respect
That makes calamity of so long life ,
For who would bear the whips and scorns of time,
Th'oppressor's wrong , the proud man 's contumely ,
The pangs of despised love , the law 's delay ,
The insolence of office , and the spurns
That patient merit of th' unworthy takes ,
When he himself might his quietus make
With a bare bodkin ? who would fardels bear ,
To grunt and sweat under a weary life ,
But that the dread of something after death ,
The undiscovered country from whose bourn
No traveller returns , puzzles the will ,
And makes us rather bear those ills we have
Than fly to others that we know not of ?
Thus conscience does make cowards of us all ,
And thus the native hue of resolution
Is sicklied o'er with the pale cast of thought ,
And enterprises of great pitch and moment
With this regard their currents turn awry ,
And lose the name of action .
Soft you now ! The fair Ophelia ! Nymph ,
in thy orisons be all my sins remember'd .
To flee or not to flee .
To flee or not to flee .
To see or not to see .
To see or not to see .
To pee or not to pee .
"""
with open('/tmp/hamlet.txt', 'w') as f:
    f.write(hamlet)
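
# Encode the corpus with the colibri-classencode command-line tool; this
# produces /tmp/hamlet.colibri.cls (class file) and /tmp/hamlet.colibri.dat
# (encoded corpus), both of which are loaded below.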
print("Class encoding corpus...")
assert os.system("cd /tmp/ && colibri-classencode hamlet.txt") == 0
print("Loading new decoder")
decoder = colibricore.ClassDecoder("/tmp/hamlet.colibri.cls")
encoder = colibricore.ClassEncoder("/tmp/hamlet.colibri.cls")
print("Loading corpus as IndexedCorpus")
corpus = colibricore.IndexedCorpus("/tmp/hamlet.colibri.dat")
print("Total number of tokens: ", len(corpus))
firstword = corpus[(1,0)]
print("First word: ", test(firstword.tostring(decoder),"To"))
needle = encoder.buildpattern("fair Ophelia")
for match in corpus.findpattern(needle):
    print("'fair Ophelia' found at ", match)
print("Token iteration:")
i = 0
for ref in corpus:
    i += 1
print("Total number of tokens: ", test(len(corpus),i))
print()
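# Unindexed pattern model: counts how often each pattern occurs, without
# storing the positions at which it occurs.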
options = colibricore.PatternModelOptions(doskipgrams_exhaustive=True)
print("\n===== Building unindexed model ======\n")
unindexedmodel = colibricore.UnindexedPatternModel()
unindexedmodel.train("/tmp/hamlet.colibri.dat",options)
print("Pattern count", test(len(unindexedmodel), 385))
print("Type count", test(unindexedmodel.types(), 186))
print("Token count", test(unindexedmodel.tokens(), 354))
unindexedmodel.printmodel(decoder)
print("REPORT:")
unindexedmodel.report()
print("HISTOGRAM:")
unindexedmodel.histogram()
outputfilename = "/tmp/data.colibri.patternmodel"
print("Writing to file")
unindexedmodel.write(outputfilename)
print("Loading unindexed corpus")
unindexedmodel = colibricore.UnindexedPatternModel("/tmp/data.colibri.patternmodel")
print("REPORT:")
unindexedmodel.report()
print("iterating over all patterns")
i = 0
for pattern in unindexedmodel:
    print(pattern.tostring(decoder))
    i += 1
print("Pattern count", test(i, 385))
print("iterating over all patterns and values")
i = 0
for pattern, value in unindexedmodel.items():
    print(pattern.tostring(decoder), value)
    i += 1
print("Pattern count", test(i, 385))
print("Extracting count for specific pattern")
print(test(unindexedmodel[encoder.buildpattern("to be")],2))
print("\n======= Loading corpus data =========\n")
corpus = colibricore.IndexedCorpus("/tmp/hamlet.colibri.dat")
print("Sentence count", test(corpus.sentencecount(),40))
i = 0
for sentence in corpus.sentences():
    print(sentence.tostring(decoder))
    i += 1
print("Count check",test(i,40))
print("\n======= Building indexed model =========\n")
options = colibricore.PatternModelOptions(doskipgrams=True)
indexedmodel = colibricore.IndexedPatternModel(reverseindex=corpus)
indexedmodel.train("/tmp/hamlet.colibri.dat",options)
print("Pattern count", test(len(indexedmodel), 133))
print("Type count", test(indexedmodel.types(), 186))
print("Token count", test(indexedmodel.tokens(), 354))
indexedmodel.printmodel(decoder)
print("REPORT:")
indexedmodel.report()
print("HISTOGRAM:")
indexedmodel.histogram()
outputfilename = "/tmp/data.colibri.indexedpatternmodel"
print("Writing to file")
indexedmodel.write(outputfilename)
print("Loading indexed corpus")
indexedmodel = colibricore.IndexedPatternModel("/tmp/data.colibri.indexedpatternmodel")
print("iterating over all patterns and values")
for pattern, value in indexedmodel.items():
    print(pattern.tostring(decoder), len(value))
print("Extracting count for specific pattern")
print(test(len(indexedmodel[encoder.buildpattern("to be")]),2))
print("Test done")