# 01_find_duplicates_dok.py
'''
First basic script to find duplicate sentences.
The script uses plain Python only, to establish a baseline time measurement.
This will help gauge the performance of the numpy/scipy modules in the second script.
This script will take ages to run on the entire set of 10MM sentences:
the pairwise comparison is O(n^2) in time, and since all normalised vectors
are held in memory, memory use is O(n).
'''
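# Usage sketch (the row count and threshold below are example values, not
# from the original):
#   python 01_find_duplicates_dok.py 10000 0.9
# argv[1] = number of sentences to read, argv[2] = cosine similarity threshold.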
import sys
import math
from gensim import corpora

raw_file = 'mmd_assignment/sentences.txt'
writer1 = open('mmd_assignment/sentences_similar_groups_dok.txt', 'w')
writer2 = open('mmd_assignment/sentences_similar_sentences_dok.txt', 'w')
texts = []
num_rows = int(sys.argv[1])      # number of sentences to process
threshold = float(sys.argv[2])   # cosine similarity threshold
counter = 0
def get_cosine(vec1, vec2):
    # The vectors are already L2-normalised, so the dot product over the
    # shared keys is exactly the cosine similarity; no denominator is needed.
    intersection = set(vec1) & set(vec2)
    return sum(vec1[x] * vec2[x] for x in intersection)
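# Illustrative sanity check (the values are made up for this example):
# get_cosine({0: 0.6, 1: 0.8}, {1: 0.8, 2: 0.6}) -> 0.8 * 0.8 = 0.64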
# Read the file; each line is "<id> <word> <word> ...", so drop the leading id
for row in open(raw_file, 'r'):
    counter += 1
    if counter > num_rows:
        break
    texts.append(row.strip().split(' ')[1:])
# Create a dictionary mapping each word to an integer id
dictionary = corpora.Dictionary(texts)
# Create the bag-of-words vector for each document (row) in the data
# (only words in the dictionary are kept)
corpus = [dictionary.doc2bow(text) for text in texts]
print("Bag of words built")
# L2-normalise every vector in the corpus so dot products equal cosine similarity
corp = []
for vec in corpus:
    dict_vec = dict(vec)
    denom = math.sqrt(sum(count ** 2 for count in dict_vec.values()))
    if denom != 0:
        for key in dict_vec:
            dict_vec[key] = dict_vec[key] / denom
    corp.append(dict_vec)
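# Example of the normalisation (counts are illustrative):
# {0: 3, 1: 4} has norm sqrt(9 + 16) = 5, so it becomes {0: 0.6, 1: 0.8}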
# Compare each document with every later document; for each similarity at or
# above the threshold, put the pair into a group of similar sentences
i = 0
group = {}   # sentence index -> group id
grp = 1
for vec1 in corp:
    i = i + 1
    index = i
    matches = []   # alternating (index, similarity) entries for this row
    for vec2 in corp[i:]:
        index = index + 1
        sim = get_cosine(vec1, vec2)
        if sim >= threshold:
            matches.append(index)
            matches.append(sim)
            if index in group:
                group[i] = group[index]
            elif i in group:
                group[index] = group[i]
            else:
                grp += 1
                group[index] = grp
                group[i] = grp
    if len(matches) > 0:
        writer1.write('|'.join(map(str, [i] + matches)) + '\n')
    if i % 100 == 0:
        print("Done with: %s rows" % i)
# Re-read the raw file and write each grouped sentence with its group id
similar_texts = set(group.keys())
counter = 0
for row in open(raw_file, 'r'):
    counter += 1
    if counter > num_rows:
        break
    if counter in similar_texts:
        writer2.write('|'.join(map(str, [group[counter], counter, row])))
writer1.close()
writer2.close()