This repository has been archived by the owner on May 5, 2023. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
semantica.py
175 lines (136 loc) · 6.55 KB
/
semantica.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import gensim
from gensim import matutils
import gensim.downloader as api
import numpy as np
from numpy import ndarray, float32, array, dot, mean, median
class Semantica:
    """Toolkit for exploring a word2vec embedding space.

    Wraps a pretrained Google News word2vec model (fetched via the gensim
    downloader) and exposes operations on "concepts", where a concept is
    either a vocabulary key (str) or an embedding vector (numpy ndarray).
    """

    def __init__(self, word_count=100000):
        """Load the pretrained word2vec vectors.

        Args:
            word_count: number of most-frequent vocabulary entries to load;
                limiting the vocabulary keeps memory usage bounded.
        """
        # `return_path=True` makes api.load return the local path of the
        # (possibly freshly downloaded) model file instead of loading it.
        self.c = gensim.models.KeyedVectors.load_word2vec_format(
            api.load('word2vec-google-news-300', return_path=True),
            binary=True, limit=word_count)

    def unique(self, sequence):
        """Deduplicate `sequence`, preserving first-seen element order.

        Returns a new list; `sequence` itself is not modified.
        """
        seen = set()
        # set.add returns None (falsy), so the predicate is True only the
        # first time each element is encountered.
        return [x for x in sequence if not (x in seen or seen.add(x))]

    def lower_unique(self, concept_keys):
        """Lowercase a list of strings and deduplicate, preserving order.

        Fix: no longer mutates the caller's list in place (the previous
        version overwrote `concept_keys[i]` by index); the lowercased keys
        are built in a fresh list and the original list is left untouched.
        """
        return self.unique([key.lower() for key in concept_keys])

    def to_vector(self, concept, norm_result=True):
        """Turn a concept key or vector into a concept vector.

        Args:
            concept: vocabulary key (str) or embedding vector (ndarray).
            norm_result: if True, return the unit-length (L2-normalized) vector.

        Raises:
            ValueError: if `concept` is neither str nor ndarray.
        """
        # Extract the concept vector according to the input type.
        if isinstance(concept, ndarray):
            result_vector = concept
        elif isinstance(concept, str):
            result_vector = self.c.get_vector(concept)
        else:
            raise ValueError("concept should be of type str or ndarray.")
        # Optionally normalize the result.
        if norm_result:
            result_vector = matutils.unitvec(result_vector)
        return result_vector

    def field(self, concept, norm_concept=True, lower=True, max_concept_count=10):
        """Return the semantic field (nearest neighbours) of a concept.

        Args:
            concept: vocabulary key or vector to query around.
            norm_concept: normalize the query vector before the search.
            lower: lowercase and deduplicate the resulting keys.
            max_concept_count: number of neighbours to request from the model.
        """
        neighbours = self.c.most_similar(
            [self.to_vector(concept, norm_result=norm_concept)],
            topn=max_concept_count)
        field = [key for key, _score in neighbours]
        # Optionally make concept keys lowercase and unique.
        if lower:
            field = self.lower_unique(field)
        # Remove the query concept key itself from its own field.
        # NOTE: as in the original, the comparison is case-sensitive, so a
        # lowercased neighbour may survive when `concept` is mixed-case.
        if isinstance(concept, str):
            field = [key for key in field if key != concept]
        return field

    def mix(self, *concepts, norm_concepts=True, norm_result=True, lower=True, return_vector=False):
        """Combine the meaning of multiple concept keys or vectors.

        Averages the concept vectors, then either returns the averaged
        vector directly (`return_vector=True`) or the semantic field
        around that average.
        """
        # Vectorize every input concept.
        concept_vectors = [
            self.to_vector(concept, norm_result=norm_concepts)
            for concept in concepts]
        # Compute the (float32) average of the vectorized concepts.
        mix = array(concept_vectors).mean(axis=0).astype(float32)
        if return_vector:
            return mix
        # `field` already lowercases/dedupes when lower=True; the second
        # lower_unique pass the original performed here was redundant
        # (the operation is idempotent) and has been removed.
        results = self.field(mix, norm_concept=norm_result, lower=lower)
        # Remove the query concept keys themselves from the result
        # (set membership instead of one filtering pass per concept).
        query_keys = {c for c in concepts if isinstance(c, str)}
        return [key for key in results if key not in query_keys]

    def shift(self, source, target, norm_concepts=True, norm_result=True):
        """Return a vector encoding the semantic shift source -> target.

        The shift is (target - source) / 2 — identical (bit-for-bit) to
        the original mean of [-source, target].
        """
        # Extract concept vectors for the source and target concepts.
        source_vector = self.to_vector(source, norm_result=norm_concepts)
        target_vector = self.to_vector(target, norm_result=norm_concepts)
        shift = ((target_vector - source_vector) / 2).astype(float32)
        # Optionally normalize the result.
        if norm_result:
            shift = matutils.unitvec(shift)
        return shift

    def span(self, start, end, steps=5, norm_concepts=False, norm_shift_result=False, norm_result=False, norm_mix_concepts=False):
        """Interpolate the semantic space between two concept keys.

        Args:
            start, end: vocabulary keys (str) bounding the spectrum.
            steps: number of interpolation points between the two ends.

        Returns:
            Lowercased keys ordered from `start`-like to `end`-like, with
            the lowercased ends prepended/appended.
        """
        results = []
        shift = self.shift(
            start, end, norm_concepts=norm_concepts, norm_result=norm_shift_result)
        # Walk from `start` toward `end`, collecting the neighbours of each
        # intermediate point.  The scaling expression keeps the original
        # operand order so float results (and neighbour ranking) match.
        for step in range(1, steps + 1):
            step_field = self.mix(
                start, shift * (1 / (steps + 1)) * step,
                norm_result=norm_result, norm_concepts=norm_mix_concepts,
                lower=False)
            results.extend(step_field)
        # Remove the query concept keys themselves from the result.
        results = [key for key in results if key not in (start, end)]
        # Sort keys by their location along the start -> end spectrum.
        results = sorted(
            results,
            key=lambda key: self.c.similarity(key, end) - self.c.similarity(key, start))
        # Make concept keys lowercase and unique, then add the ends.
        results = self.lower_unique(results)
        return [start.lower(), *results, end.lower()]

    def match(self, *model, target=None):
        """Search the vocabulary for analogies to a given conceptual model.

        The model is a root concept followed by leaf concepts; each
        root -> leaf relation forms a "skeleton" edge that candidate roots
        are tested against.  Candidates whose mean relation alignment
        exceeds 0.25 are printed as they are found (nothing is returned).

        Args:
            model: root concept key followed by one or more leaf keys.
            target: optional key restricting the search to its 10000-nearest
                neighbourhood; when None the whole vocabulary is scanned.
        """
        # Extract conceptual relations from the model.
        root = model[0]
        skeleton = [self.shift(root, leaf) for leaf in model[1:]]
        # Define the target domain of candidate root vectors.
        if target:
            target_domain = [
                self.to_vector(key)
                for key, _score in self.c.most_similar(target, topn=10000)]
        else:
            target_domain = self.c.vectors
        for i, new_root_vector in enumerate(target_domain):
            # Project the skeleton onto the candidate root.
            new_leaf_vectors = [
                self.mix(new_root_vector, relation, return_vector=True)
                for relation in skeleton]
            # Map vectors back to concept keys, excluding the model's own
            # keys and the candidate root itself.
            new_root_concept = self.c.similar_by_vector(new_root_vector)[0][0]
            new_leaf_concepts = [
                [key for key, _score in self.c.similar_by_vector(leaf_vector)
                 if key not in [*model, new_root_concept]]
                for leaf_vector in new_leaf_vectors]
            # Evaluate the match as the mean alignment (dot product) between
            # the candidate's relations and the skeleton's relations.
            match_score = mean([
                dot(self.shift(new_root_concept, leaf_key), relation)
                for leaf_keys, relation in zip(new_leaf_concepts, skeleton)
                for leaf_key in leaf_keys])
            # Print if there's a match.
            if match_score > 0.25:
                match = [new_root_concept, *new_leaf_concepts]
                print(i, match, match_score)