-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUtils.py
158 lines (105 loc) · 4.91 KB
/
Utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import numpy as np
def removePunctuationFromWord(word: str):
"""Removes punctuation characters from a word."""
return ''.join(char for char in word if char not in removePunctuationFromWord.unwantedChars)
# List of punctuation characters to remove from words in the process of decoding
removePunctuationFromWord.unwantedChars = set("!?,.:;\'\" []()")
def replaceOldEnglishCharsInWord(word: str):
"""Replaces certain character sequences in a word with their original Old English character equivalents."""
newWord = word
for i in replaceOldEnglishCharsInWord.charReplacements:
newWord = newWord.replace(*i)
return newWord
# List of character replacements to make in the process of decoding a word
replaceOldEnglishCharsInWord.charReplacements = (
("Ä\u0081", "ā"),
("Ä“", "ē"),
("Ä«", "ī"),
("Å\u008d", "ō"),
("Å«", "ū"),
("ȳ", "ȳ"),
("Ç£", "ǣ"),
("æ", "æ"),
("Ä\u2039", "ċ"),
("Ä¡", "ġ"),
("þ", "þ"),
("ð", "ð"),
("Ä\u2019", "Ē"),
("Ä\u00a0", "Ġ"),
# Missing several less common letters
)
def wordStringToVector(word: str, dim: int):
"""
Returns the vector representation of a word.
Parameters:
word (str): The string representation of the word
dim (int): The desired dimension of the resulting vector (dim >= len(word))
Returns:
vector (list): The vector of length `dim` as a list of ints
"""
wordLength = len(word)
if dim < wordLength:
raise Exception("dim cannot be lower than the length of the word")
vector = [0] * dim
for i in range(wordLength):
vector[i] = ord(word[i])
return vector
def intToVector(val: int, dim: int):
"""Transform an integer into a vector of dimension `dim` by setting each value in the vector to 0 except for a value of 1 in the dimension equal to `val` (note: dimensions are 0-indexed)."""
if val < 0:
raise Exception("val cannot be negative")
elif dim <= val:
raise Exception("dim cannot be less than or equal to val (note: `dim` is the number of dimensions in the resulting array and dimensions are 0-indexed)")
vector = [0] * dim
vector[val] = 1
return vector
def noramlizeWordVector(word: np.ndarray, minCharValue: int, maxCharValue: int):
"""
Normalizes a word vector so that all values are within the range [0, 1].
Parameters:
word (numpy array): The unnormalized vector representation of a word
minCharValue (int): The minimum character value in all of the data (not just this word)
maxCharValue (int): The maximum character value in all of the data (not just this word)
Returns:
A numpy array (dtype=np.float64) of the normalized vector
"""
return (word - minCharValue) / (maxCharValue - minCharValue)
def normalizedWordVectorToString(word: np.ndarray, minCharValue: int, maxCharValue: int):
"""
Returns the string representation of a normalized word vector.
Parameters:
word (numpy array): The normalized vector representation of a word
minCharValue (int): The minimum character value in all of the data (not just this word)
maxCharValue (int): The maximum character value in all of the data (not just this word)
Returns:
The string representation of `word`
"""
unNormalizedVector = (word * (maxCharValue - minCharValue)) + minCharValue
unNormalizedVector = unNormalizedVector.astype(np.int32)
return ''.join(chr(char) for char in unNormalizedVector)
def distanceBetweenVectors(vector1: np.ndarray, vector2: np.ndarray, minCharValue: int, maxCharValue: int):
"""
This function takes in normalized vectors as parameters and returns the Euclidean distance between their unnormalized equivalents.
Parameters:
vector1 (numpy array): The normalized vector representation of a word
vector2 (numpy array): The normalized vector representation of a word
minCharValue (int): The minimum character value in all of the data (not just these words)
maxCharValue (int): The maximum character value in all of the data (not just these words)
Returns:
The distance between the unnormalized equivalents of `vector1` and `vector2` as a float
"""
unnormalizedVector1 = (vector1 * (maxCharValue - minCharValue)) + minCharValue
unnormalizedVector2 = (vector2 * (maxCharValue - minCharValue)) + minCharValue
squaredDifferences = (unnormalizedVector2 - unnormalizedVector1) ** 2
return np.sum(squaredDifferences) ** 0.5
def partOfSpeechVectorToInt(vector: np.ndarray):
"""Converts a part-of-speech vector into its integer representation."""
if np.size(vector) == 0:
raise Exception("vector is empty")
maxVal = vector[0]
maxValIndex = 0
for i in range(1, len(vector)):
if vector[i] > maxVal:
maxVal = vector[i]
maxValIndex = i
return maxValIndex