-
Notifications
You must be signed in to change notification settings - Fork 0
/
my_utils.py
66 lines (55 loc) · 2.3 KB
/
my_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used by the helpers below: the stop-word lists
# (read through nltk.corpus.stopwords) and the Punkt sentence tokenizer data.
# NOTE: these calls run every time this module is imported.
nltk.download('stopwords')
nltk.download('punkt')
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
# Lazily-filled cache of the English stop words; see _english_stopwords().
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn a piece of text into a list of lower-case alphabetic words.

    Every character that is not an ASCII letter (digits, punctuation, the
    numeric suffix of tags, ...) is replaced by a space before the text is
    lower-cased and split on whitespace.

    Args:
        essay_v: The text to tokenize.
        remove_stopwords: When truthy, drop the words found in NLTK's
            English stop-word list.

    Returns:
        The list of remaining words, in their original order.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", essay_v)
    words = letters_only.lower().split()
    if remove_stopwords:
        # The stop-word set is cached: this function is called once per
        # sentence, and re-reading the corpus each time is wasted work.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words
def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and tokenize each sentence into words.

    The stripped essay is cut into sentences with NLTK's English Punkt
    tokenizer; every non-empty sentence is then handed to
    essay_to_wordlist() together with ``remove_stopwords``.

    Returns:
        A list with one word list per non-empty sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(sentence, remove_stopwords)
        for sentence in punkt.tokenize(essay_v.strip())
        if sentence
    ]
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one essay.

    Args:
        words: Iterable of word tokens belonging to a single essay.
        model: Embedding model exposing ``model.wv`` with a ``key_to_index``
            mapping and ``model.wv[word]`` vector lookup (the gensim 4
            ``Word2Vec`` interface).
        num_features: Length of each word vector.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the
        vectors of the words found in the vocabulary. When none of the words
        is in the vocabulary (or ``words`` is empty) an all-zero vector is
        returned instead of dividing by zero.
    """
    feature_vec = np.zeros((num_features,), dtype="float32")
    vectors = model.wv
    # Test membership against the model's own mapping rather than building a
    # set of the whole vocabulary on every call.
    vocab = vectors.key_to_index
    num_words = 0
    for word in words:
        if word in vocab:
            num_words += 1
            feature_vec = np.add(feature_vec, vectors[word])
    if num_words == 0:
        # No known word: dividing would fill the vector with NaN and emit a
        # RuntimeWarning, so hand back the zero vector as is.
        return feature_vec
    return np.divide(feature_vec, num_words)
def getAvgFeatureVecs(essays, model, num_features):
    """Build the matrix of averaged word vectors, one row per essay.

    Each entry of ``essays`` is passed to makeFeatureVec() and the resulting
    vector is stored in the matching row of a float32 array of shape
    ``(len(essays), num_features)``, which is returned.
    """
    matrix = np.zeros((len(essays), num_features), dtype="float32")
    for row, tokens in enumerate(essays):
        matrix[row] = makeFeatureVec(tokens, model, num_features)
    return matrix
def get_model():
    """Build, compile and summarize the two-layer bidirectional GRU network.

    Layers, in order: a bidirectional GRU of 300 units (input dropout 0.4,
    returning full sequences, input shape ``(1, 300)``), a bidirectional GRU
    of 64 units, a dropout of 0.3 and a single-unit dense layer with ReLU
    activation. The model is compiled with mean-squared-error loss, the
    RMSprop optimizer and mean absolute error as metric, and its summary is
    printed before it is returned.
    """
    first_gru = Bidirectional(
        GRU(300, dropout=0.4, recurrent_dropout=0, return_sequences=True),
        input_shape=(1, 300),
    )
    second_gru = Bidirectional(GRU(64, recurrent_dropout=0))
    network = Sequential([
        first_gru,
        second_gru,
        Dropout(0.3),
        Dense(1, activation='relu'),
    ])
    network.compile(
        loss='mean_squared_error',
        optimizer='rmsprop',
        metrics=['mae'],
    )
    network.summary()
    return network