Lemmatization.py
import mysql.connector
import numpy as np
import random
import statistics
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import activations
from tensorflow.keras.utils import plot_model
import Utils  # local helper module for character replacement and word-vector encoding
# Connect to the MySQL server
connection = mysql.connector.connect(host='localhost', database='oldengli_oea', user='root', password='password')
if not connection.is_connected():
    print("Error: could not connect to MySQL server")
    exit()
# Retrieve a list of tuples, where each tuple has two items: the spelling of a word and its headword form
cursor = connection.cursor()
cursor.execute("SELECT spelling, hw FROM words INNER JOIN entries ON entries.entry_id = words.entry_id;")
data = cursor.fetchall()
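# Each row pairs an attested spelling with its dictionary headword, e.g.
# ('cyninges', 'cyning') (a hypothetical example; actual rows depend on the database).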
# Close the cursor and connection
cursor.close()
connection.close()
# Remove any entries that are missing a spelling or headword
data = [x for x in data if x[0] and x[1]]
# The number of datapoints
numDataPoints = len(data)
# Format the data
data = [(Utils.replaceOldEnglishCharsInWord(x[0]), Utils.replaceOldEnglishCharsInWord(x[1])) for x in data]
data = [(Utils.removePunctuationFromWord(x[0]), Utils.removePunctuationFromWord(x[1])) for x in data]
data = [(x[0].lower(), x[1].lower()) for x in data]
# Randomize the order of the entries
random.shuffle(data)
# Transform the spellings and headwords into vectors of dimension `maxWordLength`
maxWordLength = max([max(len(x[0]), len(x[1])) for x in data])
data = [(Utils.wordStringToVector(x[0], maxWordLength), Utils.wordStringToVector(x[1], maxWordLength)) for x in data]
data = np.array(data, dtype=np.float64)
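# A minimal sketch of the per-character encoding assumed here; the actual
# implementation lives in Utils and may differ:
def _exampleWordStringToVector(word, length):
    # Map each character to its Unicode code point and zero-pad to `length`,
    # e.g. _exampleWordStringToVector("abc", 5) -> [97, 98, 99, 0, 0]
    codes = [ord(c) for c in word]
    return codes + [0] * (length - len(codes))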
# Normalize data so all values fall within the range [0, 1]
maxCharValue = np.amax(data)
minCharValue = np.amin(data)
data = [(Utils.noramlizeWordVector(x[0], minCharValue, maxCharValue), Utils.noramlizeWordVector(x[1], minCharValue, maxCharValue)) for x in data]
data = np.array(data, dtype=np.float64)
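# Assuming Utils.noramlizeWordVector is standard min-max scaling (an assumption),
# each value c is mapped to (c - minCharValue) / (maxCharValue - minCharValue).
# Because the zero padding is included in `data`, minCharValue is typically 0.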
# Split data into training (80%), validation (10%), and testing (10%) sets
trainingSetCutoff = int(numDataPoints * 0.8)
validationSetCutoff = int(numDataPoints * 0.9)
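# `data` has shape (numDataPoints, 2, maxWordLength); axis 1 selects the
# spelling (index 0) or the headword (index 1) vector of each pair.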
spellingsTrainingSet = data[:trainingSetCutoff, 0, :]
headwordsTrainingSet = data[:trainingSetCutoff, 1, :]
spellingsValidationSet = data[trainingSetCutoff:validationSetCutoff, 0, :]
headwordsValidationSet = data[trainingSetCutoff:validationSetCutoff, 1, :]
spellingsTestingSet = data[validationSetCutoff:, 0, :]
headwordsTestingSet = data[validationSetCutoff:, 1, :]
lengthOfTestingSet = len(spellingsTestingSet)
# Make the model
model = keras.Sequential([
    keras.layers.Dense(maxWordLength * 2, activation=activations.relu),
    keras.layers.Dense(maxWordLength, activation=activations.relu)
])
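# The model maps a normalized spelling vector of length maxWordLength through
# one hidden layer to a vector of the same length representing the headword.
# Note that the output layer also uses ReLU, so predictions are non-negative
# but, unlike the normalized targets, not bounded above by 1.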
# Compile the model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.005),
    loss=keras.losses.MeanSquaredError(),
)
# Fit the data
model.fit(
    spellingsTrainingSet,
    headwordsTrainingSet,
    validation_data=(spellingsValidationSet, headwordsValidationSet),
    batch_size=32,
    epochs=25
)
# Make a plot of the model
plot_model(
    model,
    to_file="ModelPlots/LemmatizationModel.png",
    show_shapes=True,
    show_layer_names=True
)
# Evaluate the model
modelOutput = model(tf.convert_to_tensor(spellingsTestingSet), training=False).numpy()
# Calculate the distance between each predicted vector and its target headword vector
distances = [Utils.distanceBetweenVectors(headwordsTestingSet[i], modelOutput[i], minCharValue, maxCharValue) for i in range(lengthOfTestingSet)]
averageDistance = statistics.mean(distances)
stdevDistance = statistics.stdev(distances)
print("\n\nDistance: %.3f +/- %.3f" % (averageDistance, stdevDistance))
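# A minimal sketch of decoding a predicted vector back into text, assuming the
# per-character encoding sketched above (a hypothetical helper; the real
# inverse, if any, would live in Utils):
def _exampleVectorToWordString(vector, minValue, maxValue):
    # Undo the min-max scaling, round to the nearest character code, and drop
    # the zero padding.
    denormalized = vector * (maxValue - minValue) + minValue
    codes = [int(round(c)) for c in denormalized]
    return "".join(chr(c) for c in codes if c > 0)

# Hypothetical usage: inspect the first test prediction as a word.
# print(_exampleVectorToWordString(modelOutput[0], minCharValue, maxCharValue))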