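# Next-word prediction with a Keras LSTM: the script trains on
# [current word, next word] pairs from a short nursery rhyme and then
# repeatedly predicts the word that follows a user-supplied word.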
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
data = """ Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n """
# Create a Tokenizer object
tokenizer = Tokenizer()
# Fit the tokenizer on the text to build its vocabulary
tokenizer.fit_on_texts([data])
# encoded_data holds each word's integer index; indices are assigned by
# frequency, so 1 is the most frequent word and 21 the least frequent here
encoded_data = tokenizer.texts_to_sequences([data])[0]
print(encoded_data)
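# Expected output, assuming Keras assigns indices by descending frequency with
# ties in order of first appearance (exact values may vary by version):
# [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 14, 15, 1, 16, 17, 18, 1, 3, 19, 20, 21]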
''' Data Formation
Calculate the vocabulary size:
tokenizer.word_index gives a dictionary mapping each word to its integer index,
e.g. "and" occurs most often in the data, so it has index 1.
Now a list containing each current word and its next word is created; this list
is fed into the neural network, and training is done on it.
List format ----> [current word, next word]'''
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
print(vocab_size)
word_sequence = []  # list of [current word, next word] pairs
for i in range(1, len(encoded_data)):
    sequence = encoded_data[i-1:i+1]  # current word and its next word
    word_sequence.append(sequence)
word_sequence = np.array(word_sequence)  # convert the list into an array
print(word_sequence)
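# Under the index assignment above, the first few pairs would be
# [[2 1], [1 3], [3 4], ...] with shape (24, 2): 24 bigrams from 25 tokens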
'''# Splitting of Data
Now that the data is in the right format, it can be fed to the ML model.
'''
x_train, y_train = word_sequence[:, 0], word_sequence[:, 1]  # slice the array's columns into x_train and y_train
print(f"x_train:{x_train} \ny_train:{y_train}")
x_train = x_train.reshape(-1, 1)  # shape (samples, 1); newer Keras expects explicit 2D input
# Convert the integer class labels to one-hot vectors;
# the resulting binary variables are often called "dummy variables"
y_train = to_categorical(y_train, num_classes=vocab_size)
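# e.g. to_categorical([2], num_classes=4) -> [[0., 0., 1., 0.]]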
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))  # map each word index to a 10-dim vector
model.add(LSTM(70))
model.add(Dense(vocab_size, activation="softmax"))  # probability distribution over the vocabulary
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()  # summary() prints directly; wrapping it in print() would also print "None"
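# Per-sample shapes, given the sizes above: input (1,) -> Embedding (1, 10)
# -> LSTM (70,) -> Dense (vocab_size,) softmax probabilities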
model.fit(x_train, y_train, epochs=500, verbose=1)  # the corpus is tiny, so the model quickly memorizes the bigram transitions
x_test = input("enter the word whose next word you want to predict: ")
no_next_word = int(input("enter the number of predictions you want: "))
# in_test is a copy so the original x_test is left unchanged; it can be
# skipped and the changes made directly on x_test
in_test = x_test
print(f"entered word: {x_test}")
for i in range(no_next_word):
    out_word = ""
    encoded_test = tokenizer.texts_to_sequences([in_test])[0]
    encoded_test = np.array(encoded_test).reshape(1, 1)  # batch of one single-step sequence
    # Prediction: Sequential.predict_classes was removed in recent Keras,
    # so take the argmax of the softmax output instead
    y_predict = int(np.argmax(model.predict(encoded_test, verbose=0), axis=-1)[0])
    # map the predicted index back to its word
    for word, index in tokenizer.word_index.items():
        if index == y_predict:
            out_word = word
            break
    in_test = out_word  # feed the prediction back in as the next input
    print(f"predicted next word {i+1}: {out_word}")
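# Design note: decoding is greedy; each step feeds the argmax word back in as
# the next input, so a given starting word always yields the same chain.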