02_find_duplicates_matrix.py
'''
Second script for finding similar sentences.
The script reads documents and builds a bag-of-words (BOW) matrix. This matrix is used to find similarity using arithmetic operations.
The BOW matrix is sparse, but the similarity matrix is dense, so the computation runs out of memory at higher BOW dimensions.
To accommodate this, the matrix is broken into blocks and the arithmetic operations are carried out block by block.
The algorithm is O(n*log(n)) in time and memory.
'''
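# Usage (a sketch, assuming the script is run from the repository root so the relative
# paths below resolve):
#   python 02_find_duplicates_matrix.py <num_rows>
# where <num_rows> is the number of lines of sentences.txt to process (read from sys.argv[1]).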
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import binarize
import sys, math
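# Input format (inferred from the parsing below): each line of sentences.txt is assumed to
# start with a sentence id followed by the sentence tokens, separated by single spaces;
# the id is dropped via split(' ')[1:] when the text is collected.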
raw_file = 'mmd_assignment/sentences.txt'
writer = open('mmd_assignment/sentences_similar_sentences_matrix_%s.txt'%sys.argv[1],'wb')
texts = []
num_rows = int(sys.argv[1])
# threshold = float(sys.argv[2])
# num_rows = 10000
threshold = 0.95
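# Minimum cosine similarity for two sentences to be grouped as near-duplicates
# (hard-coded here; the commented line above shows it can also come from the command line).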
# Read the file up to num_rows lines; vary num_rows to find where the memory requirement grows beyond the limit
counter = 0
for row in open(raw_file,'rb'):
    counter += 1
    if counter == num_rows:
        break
    texts.append(' '.join(row.strip().split(' ')[1:]))
# Make a sparse numerical representation of the sentences
vectorizer = CountVectorizer(min_df=1)
X = normalize(vectorizer.fit_transform(texts).astype(np.float64),norm='l2').tocsr()
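# Because each row of X is L2-normalized, the sparse block product X1 * X2.T below yields
# the cosine similarity between sentence vectors directly, so cosine_similarity itself is
# never called on this path.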
# mag_squared,= np.array(X.multiply(X).sum(1).T)
# mag_all = np.sqrt(mag_squared)
# mag_all[mag_all == 0] = 1
# del X,mag_squared,vectorizer
group = 0
group_map = {}
# Loop over documents, taking num_documents2 at a time, to find similarity. Taking all documents at once would lead to a memory error
# Find the similarity of documents and assign similar ones to groups
num_documents2 = 2**12
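# Each pass compares one block of num_documents2 (4096) sentences against every sentence up
# to the end of that block, so at most a (4096 x N) similarity block is materialized at a
# time instead of the full N x N matrix.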
for segment3 in range(0,int(math.ceil((1.0*len(texts))/num_documents2))):
    X1 = X[num_documents2*segment3:min(num_documents2*(segment3+1),len(texts))]
    X2 = X[0:min(num_documents2*(segment3+1),len(texts))]
    # Block of cosine similarities, thresholded to a sparse 0/1 matrix
    Y1 = binarize(X1 * X2.T, threshold=threshold).tocsr()
    # print Y1.shape
    del X1
    for row in range(Y1.shape[0]):
        sentence_index = row+num_documents2*segment3
        if row%1000 == 0:
            print "%d documents Done"%(row+segment3*num_documents2)
        cols_array = Y1[row].toarray()[0]
        # Only consider columns before the current sentence so each pair is examined once
        cols = np.where(cols_array[0:row+segment3*num_documents2]>=threshold)[0]
        for col in reversed(cols):
            if sentence_index in group_map:
                break
            try:
                value = Y1[row,col]
            except:
                value = 0
            if value >= threshold:
                if sentence_index != col:
                    # Join the existing group of col, or start a new group for the pair
                    if col in group_map:
                        group_map[sentence_index] = group_map[col]
                    else:
                        group += 1
                        group_map[sentence_index] = group
                        group_map[col] = group
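# Collapse the raw group ids: for every group, pick the smallest row index as the canonical
# representative, then remap each sentence in group_map to that representative's row number.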
df1 = pd.DataFrame(pd.Series(group_map))
df1['Row'] = df1.index
df1.rename(columns={0:'Group'},inplace=True)
groups_map = df1.groupby('Group')['Row'].min().reset_index()
df2 = pd.merge(df1,groups_map,on='Group')
df2.index = df2['Row_x']
group = df2['Row_y'].to_dict()
similar_texts = set(group.keys())
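# Second pass over the raw file: for every sentence that belongs to a duplicate group, write
# "<representative row>|<row index>|<original line>" to the output file.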
counter = -1
for row in open(raw_file,'rb'):
    counter += 1
    if counter == num_rows:
        break
    if counter in similar_texts:
        writer.write('|'.join(map(str,[group[counter]]+[counter] +[row])))
writer.close()