-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathadjacency.py
37 lines (30 loc) · 1.41 KB
/
adjacency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
"""
Regression model to handle adjacency.
"""
import nltk
from sklearn.linear_model import LogisticRegression
class AdjacencyModel:
    """Logistic-regression classifier predicting whether two text spans
    are adjacent, given numeric feature vectors (e.g. from ``score``)."""

    def __init__(self, adjacent, non_adjacent):
        """Fit the model on labeled feature vectors.

        adjacent: feature vectors for adjacent pairs (positive class).
        non_adjacent: feature vectors for non-adjacent pairs (negative class).
        """
        negatives = list(non_adjacent)
        positives = list(adjacent)
        # Training matrix: negatives first, then positives, matching the labels.
        x = negatives + positives
        y = [False] * len(negatives) + [True] * len(positives)
        reg = LogisticRegression()
        reg.fit(x, y)
        self.reg = reg

    def proba(self, observed):
        """Return P(adjacent) for a single feature vector."""
        # predict_proba returns [[P(False), P(True)]]; take P(True).
        return self.reg.predict_proba([observed])[0][1]
def score(text1, text2):
    """Return the feature vector for a pair of texts.

    The vector is [word Jaccard similarity, bigram Jaccard similarity],
    suitable as input to ``AdjacencyModel``.
    """
    # The original had a second, unreachable `return [word_similarity(...)]`
    # after this one — dead leftover code, removed.
    return [word_similarity(text1, text2), bigram_similarity(text1, text2)]
def word_similarity(text1, text2):
    """Jaccard similarity between the lowercased word sets of two texts.

    Tokens containing no alphabetic character (pure punctuation/numbers)
    are ignored. Returns a float in [0, 1].
    """
    def _words(text):
        # Lowercased tokens that contain at least one letter.
        return {tok.lower()
                for tok in nltk.word_tokenize(text)
                if any(ch.isalpha() for ch in tok)}

    words1 = _words(text1)
    words2 = _words(text2)
    shared = words1 & words2
    combined = words1 | words2
    # max(..., 1) guards against division by zero when both texts are empty.
    return len(shared) / max(len(combined), 1)
def bigram_similarity(text1, text2):
    """Jaccard similarity between the word-bigram sets of two texts.

    Tokens containing no alphabetic character are dropped before bigrams
    are formed. Returns a float in [0, 1].
    """
    def _bigrams(text):
        # Ordered, lowercased alphabetic tokens, then their adjacent pairs.
        tokens = [tok.lower()
                  for tok in nltk.word_tokenize(text)
                  if any(ch.isalpha() for ch in tok)]
        return set(nltk.ngrams(tokens, 2))

    grams1 = _bigrams(text1)
    grams2 = _bigrams(text2)
    # max(..., 1) guards against division by zero when either text is too short.
    return len(grams1 & grams2) / max(len(grams1 | grams2), 1)