-
Notifications
You must be signed in to change notification settings - Fork 12
/
vector_pairing_models.py
53 lines (40 loc) · 2.15 KB
/
vector_pairing_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#GiG
import numpy as np
from scipy.spatial import distance
# Abstract base class for all vector pairing models.
class ABCVectorPairing:
    """Common interface shared by every vector pairing model.

    The methods defined here are no-op placeholders that return ``None``;
    concrete models override :meth:`index` and :meth:`query`.
    """

    def __init__(self):
        """Create the model; the base class keeps no state."""

    def index(self, embedding_matrix):
        """Index an embedding matrix of size #tuples x #dimension.

        The base implementation does nothing and returns ``None``.
        """
        return None

    def query(self, embedding_matrix):
        """Query with an embedding matrix of size #tuples x #dimension.

        The intended output is a matrix of size #tuples x K, where K is
        application dependent. The base implementation does nothing and
        returns ``None``.
        """
        return None
# Top-K based blocking strategy.
# We index the tuple embeddings from one of the datasets and query with the other.
# This is an expensive approach: it computes the all-pairs cosine similarity
# and then extracts the top-K neighbors of every query row.
class ExactTopKVectorPairing(ABCVectorPairing):
    """Exact top-K neighbor search based on all-pairs cosine similarity."""

    def __init__(self, K):
        """Store K, the default number of neighbors returned by query()."""
        super().__init__()
        self.K = K

    def index(self, embedding_matrix_for_indexing):
        """Remember the matrix to search against.

        Args:
            embedding_matrix_for_indexing: embedding matrix of size
                #tuples x #dimension. It is stored as-is (no copy and no
                preprocessing).
        """
        self.embedding_matrix_for_indexing = embedding_matrix_for_indexing

    def query(self, embedding_matrix_for_querying, K=None):
        """Return the indices of the top-K indexed neighbors of each query row.

        Args:
            embedding_matrix_for_querying: embedding matrix of size
                #tuples x #dimension, with the same #dimension as the
                indexed matrix.
            K: optional number of neighbors; defaults to the K given to the
                constructor.

        Returns:
            An integer matrix with one row per query tuple and at most K
            columns (fewer when the indexed matrix has fewer than K rows).
            The j-th entry of the i-th row is the row index, in the indexed
            matrix, of the j-th most similar tuple for the i-th query row.

        Raises:
            ValueError: if the effective K is negative.
        """
        if K is None:
            K = self.K
        # A negative K would be interpreted by the slice below as "all but the
        # last |K| columns" and silently return the wrong neighbors.
        if K < 0:
            raise ValueError(f"K must be non-negative, got {K}")

        # Cosine similarity between two matrices with the same number of
        # dimensions: for N1 x D and N2 x D this yields an N1 x N2 matrix.
        # The querying matrix is passed first so that each output row ranks
        # the rows of the indexing matrix.
        all_pair_cosine_similarity_matrix = 1 - distance.cdist(
            embedding_matrix_for_querying,
            self.embedding_matrix_for_indexing,
            metric="cosine",
        )

        # argsort is ascending, so negate the similarities to rank the most
        # similar first. A stable sort keeps tied neighbors in ascending index
        # order, which makes the output deterministic.
        ranked_indices_each_row = np.argsort(
            -all_pair_cosine_similarity_matrix, axis=1, kind="stable"
        )
        topK_indices_each_row = ranked_indices_each_row[:, :K]

        # The corresponding similarities are available via
        # all_pair_cosine_similarity_matrix[index, topK_indices_each_row[index]]
        return topK_indices_each_row