-
Notifications
You must be signed in to change notification settings - Fork 2
/
datakit.py
91 lines (69 loc) · 3.19 KB
/
datakit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright (c) 2013, Rui Carmo
Description: Clustering and statistics helpers
License: MIT (see LICENSE.md for details)
"""
import logging
import re
log = logging.getLogger()
_stopwords = {"en":"i,a,an,are,as,at,be,by,for,from,how,in,is,it,of,on,or,that,the,this,to,was,what,when,where".split(',')}
def strip_stopwords(sentence, lang="en"):
"""Removes stopwords and normalizes whitespace - adapted from Django"""
global _stopwords
words = sentence.split()
sentence = []
for word in words:
if word.lower() not in _stopwords[lang]:
sentence.append(word)
return u' '.join(sentence)
def jaccard_distance(a, b):
"""A simple distance function based on string overlap - adapted from sample code by Deepak Thukral"""
#Tokenize string into bag of words
feature1 = set(re.findall('\w+', strip_stopwords(a.lower()))[:100])
feature2 = set(re.findall('\w+', strip_stopwords(b.lower()))[:100])
similarity = 1.0 * len(feature1.intersection(feature2)) / len(feature1.union(feature2))
return 1 - similarity
def levenshtein_distance(a, b, limit=None):
"""Returns the Levenshtein edit distance between two strings - adapted from Whoosh"""
a = ''.join(re.findall('\w+', strip_stopwords(a.lower())))
b = ''.join(re.findall('\w+', strip_stopwords(b.lower())))
prev = None
thisrow = range(1, len(b) + 1) + [0]
for x in xrange(len(a)):
# Python lists wrap around for negative indices, so put the
# leftmost column at the *end* of the list. This matches with
# the zero-indexed strings and saves extra calculation.
prev, thisrow = thisrow, [0] * len(b) + [x + 1]
for y in xrange(len(b)):
delcost = prev[y] + 1
addcost = thisrow[y - 1] + 1
subcost = prev[y - 1] + (a[x] != b[y])
thisrow[y] = min(delcost, addcost, subcost)
if limit and x > limit and min(thisrow) > limit:
return limit + 1
return thisrow[len(b) - 1]
def damerau_levenshtein_distance(a, b, limit=None):
"""Returns the Damerau-Levenshtein edit distance between two strings - adapted from Whoosh"""
a = ''.join(re.findall('\w+', strip_stopwords(a.lower())))
b = ''.join(re.findall('\w+', strip_stopwords(b.lower())))
oneago = None
thisrow = list(range(1, len(b) + 1)) + [0]
for x in xrange(len(a)):
# Python lists wrap around for negative indices, so put the
# leftmost column at the *end* of the list. This matches with
# the zero-indexed strings and saves extra calculation.
twoago, oneago, thisrow = oneago, thisrow, [0] * len(b) + [x + 1]
for y in xrange(len(b)):
delcost = oneago[y] + 1
addcost = thisrow[y - 1] + 1
subcost = oneago[y - 1] + (a[x] != b[y])
thisrow[y] = min(delcost, addcost, subcost)
# This block deals with transpositions
if (x > 0 and y > 0 and a[x] == b[y - 1]
and a[x - 1] == b[y] and a[x] != b[y]):
thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
if limit and x > limit and min(thisrow) > limit:
return limit + 1
return thisrow[len(b) - 1]