generate_train_samples.py
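"""Generate skip-gram style training samples from the text8 corpus.

For every word in the corpus, a random subset of the words inside its
context window is sampled, and each (input, context) pair is one-hot
encoded and appended to the current batch. Batches are dumped as JSON
to Samples/samples_<n>, and the index-to-word mapping is saved to
int_to_word_dict.txt so the vectors can be decoded later.
"""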
import os
# os.environ['KERAS_BACKEND'] = 'theano'
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from random import choice
import simplejson  # to write lists to file

VOCAB_SIZE = 2000
# embedding dimensionality (not used in this script)
DIMENSIONS = 50
# size of each (left/right) side of the context window;
# the full context window size is CONTEXT_WINDOW * 2
CONTEXT_WINDOW = 5
# fraction of the words in the context window to sample
# (1 would mean use all words in the context window)
RANDOM_SAMPLES = 0.5
# after how many samples to save into a file
SAMPLES_SAVE = 10000
MAX_SAMPLES = 1000000
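# e.g. with CONTEXT_WINDOW = 5 and RANDOM_SAMPLES = 0.5, each input word
# draws int(round(0.5 * 5 * 2)) = 5 of the 10 words in its context window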
print('>Loading training corpus')
with open("text8.txt", "r") as f:
    text = f.read()

# TODO: is the text_to_word_sequence step required, or is the data already preprocessed?
filter_string = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
# TODO: tokenization cleanup
text = text_to_word_sequence(text, filters=filter_string, lower=True, split=" ")
print('No of words in corpus: {}'.format(len(text)))

# fit the tokenizer and keep only the VOCAB_SIZE most frequent words
# (Keras assigns word indices by frequency rank, starting at 1)
tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters=filter_string, lower=True, split=" ", char_level=False)
tokenizer.fit_on_texts(text)
word_to_int = {}
for k, v in tokenizer.word_index.items():
    if 0 < v <= VOCAB_SIZE:
        word_to_int[k] = v
int_to_word = dict(zip(word_to_int.values(), word_to_int.keys()))
with open('int_to_word_dict.txt', 'w') as f:
    simplejson.dump(int_to_word, f)
print('No of unique words: {}'.format(len(word_to_int)))
print('>Training corpus loaded')
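# Example: in "... the quick brown fox jumps ...", taking "brown" as the input
# word with CONTEXT_WINDOW = 2 gives the candidate context words {the, quick,
# fox, jumps}; with RANDOM_SAMPLES = 0.5, two of them are drawn at random,
# yielding pairs such as (brown, quick) and (brown, fox).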
print('>Generating training samples')
no_words = len(text)
x_train = []
y_train = []
samples = 0
count = 0

# make sure the output directory exists
if not os.path.exists('Samples'):
    os.makedirs('Samples')


def save_batch(x_batch, y_batch, batch_no):
    """One-hot encode a batch of (input, context) index pairs and dump it as JSON."""
    x_temp = []
    y_temp = []
    for x, y in zip(x_batch, y_batch):
        # word indices start at 1, so index x maps to position x - 1
        x_temp.append([0] * (x - 1) + [1] + [0] * (VOCAB_SIZE - x))
        y_temp.append([0] * (y - 1) + [1] + [0] * (VOCAB_SIZE - y))
    with open('Samples/samples_{}'.format(batch_no), 'w') as f:
        simplejson.dump([x_temp, y_temp], f)


for i in range(no_words):
    if samples > MAX_SAMPLES:
        break
    if text[i] not in word_to_int:
        continue
    if len(x_train) >= SAMPLES_SAVE:
        samples += len(x_train)
        print('Total training samples saved: {}'.format(samples))
        save_batch(x_train, y_train, count)
        count += 1
        del x_train[:]
        del y_train[:]
    # ignore input words at the beginning and end of the corpus whose
    # context window is smaller than CONTEXT_WINDOW * 2
    if i < CONTEXT_WINDOW or i + CONTEXT_WINDOW >= no_words:
        continue
    # negative offsets choose from the left context window,
    # positive offsets choose from the right one
    indices = [-x for x in range(1, CONTEXT_WINDOW + 1)] + list(range(1, CONTEXT_WINDOW + 1))
    for j in range(int(round(RANDOM_SAMPLES * CONTEXT_WINDOW * 2))):
        index = choice(indices)
        output_word = i + index
        if text[output_word] in word_to_int:
            x_train.append(word_to_int[text[i]])
            y_train.append(word_to_int[text[output_word]])
        # remove the sampled offset so it is not drawn again
        indices.remove(index)
# save any leftover samples to a final file
if len(x_train) > 0:
    samples += len(x_train)
    print('Total training samples saved: {}'.format(samples))
    save_batch(x_train, y_train, count)
    count += 1

print('No of training samples generated: {}'.format(samples))
print('No of training files saved: {}'.format(count))
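# A minimal sketch of reading a saved batch back for training (assumes this
# script has already written Samples/samples_0; the names are illustrative):
#
#   import simplejson
#   with open('Samples/samples_0') as f:
#       x_batch, y_batch = simplejson.load(f)
#   with open('int_to_word_dict.txt') as f:
#       int_to_word = simplejson.load(f)
#   # each row is a VOCAB_SIZE-dim one-hot list; recover the word index
#   idx = x_batch[0].index(1) + 1
#   print(int_to_word[str(idx)])  # JSON keys become strings after the round trip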