-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtext_analyzer.py
executable file
·240 lines (187 loc) · 8.4 KB
/
text_analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
ASCII Text Analyzer
This module implements APIs to analyze text for possible errors and provide suggestions against input dictionary
using Damerau-Levenshtein Distance algorithm. Here, Damerau-Levenshtein Distance is calculated by using Iterative Matrix method.
usage:
text_analyzer.py [-h] [-t T] [-f F] [-d D]
optional arguments:
-h, --help show this help message and exit
-t T Input text to be analyzed
-f F Input file path containing text to be analyzed
-d D Input file path of dictionary to be used with analyzer. Each new
word should be on new line.
References:
https://en.wikipedia.org/wiki/Levenshtein_distance
https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
@author: Rohit Chormale
"""
import decimal
decimal.getcontext().prec = 6
from decimal import Decimal
INSERT_COST = 1
DROP_COST = 1
SUBSTITUTE_COST = 2
TRANSPOSE_COST = 1
# if `strict` flag applied, the words having resemblance-score below this threshold, will be dropped from suggestions.
# see func `get_match_with_score` and `analyze_word` for more info
RESEMBLANCE_THRESHOLD = 0.4
DEFAULT_ENGLISH_DICT = ["hello", "this", "is", "a", "test", "program", "which", "silly", "as", "well", "easy"]
def calculate_damerau_levenshtein_distance(s, t):
"""
Calculate Damerau-Levenshtein distance using iterative matrix method.
Args:
s (string): source stringTrue
t (string): target string
Returns:
Integer value
"""
# increase matrix size null character
rows = len(s) + 1
cols = len(t) + 1
# create required matrix space
matrix = [[ 0 for j in range(cols)]for i in range(rows)]
# calculate costs of first row (horizontal). Here 0/NULL indicates empty string.
# As source string is empty, we can only perform insert operations to convert it in target
for j in range(cols):
matrix[0][j] = j * INSERT_COST
# calculate costs of first column (vertical). Here, to convert source to null,
# we can only perform drop operations.
for i in range(rows):
matrix[i][0] = i * DROP_COST
# now calculate diagonal elements. here we will iterate cols by row to calculate how many operations we need
# to convert source to target. This conversion will follow below rules,
# - if source and target characters are same at given diagonal position, cost will be 0 i.e. cost upto previous characters
# - if source and target characters are not same, then we have 3 options insert, drop and substitute.
# We will choose operation with the lowest cost
for j in range(1, cols):
for i in range(1, rows):
if s[i-1] == t[j-1]:
DIAGONAL_COST = 0
else:
DIAGONAL_COST = SUBSTITUTE_COST
matrix[i][j] = min(matrix[i][j-1] + INSERT_COST,
matrix[i-1][j] + DROP_COST,
matrix[i-1][j-1] + DIAGONAL_COST)
# damerau lookup using transpose operations for frequent scenarios user rearranged chars during typing
# we will choose lowest value in between old matrix and new one
if i > 1 and j > 1 and s[i-1] == t[j-2] and s[i-2] == t[j-1]:
matrix[i][j] = min(matrix[i][j],
matrix[i-2][j-2]+TRANSPOSE_COST)
return matrix[rows-1][cols-1]
def calculate_resemblance_score(s, t, distance):
"""
Calculate resemblance score in range of 0 to 1, using max possible distance/cost. Higher score indicate more resemblance.
If score == 1: both source and target strings are equal
Args:
source (string): source string
target (string): target string
distance (int): damerau-levenshtein distance between source and target string
Returns:
Decimal value in range of 0 to 1
"""
max_possible_distance = max(len(s), len(t)) * max(INSERT_COST, DROP_COST, SUBSTITUTE_COST, TRANSPOSE_COST)
return Decimal(1) - Decimal(distance)/Decimal(max_possible_distance)
def get_match_with_score(word, input_dict, limit=None, strict=False):
"""Returns possible matching words from input dictionary sorted by resemblance score.
Args:
word (string): word to find similar words
input_dict (list): dictionary to match input word against
limit (int): limiting output words
strict (boolean): enable RESEMBLANCE_THRESHOLD to drop suggestions having score below threshold
Returns:
List of tuples of possible matching words with resemblance score
e.g.[('foo', 0.9111), ('Bar', 0.6543), ('Baz', 0.333)]"""
suggestions = []
for i in input_dict:
dist = calculate_damerau_levenshtein_distance(word.lower(), i)
score = calculate_resemblance_score(word, i, dist)
if not strict:
suggestions.append((i, score))
else:
if score >= Decimal(0.4):
suggestions.append((i, score))
if limit is None or limit > len(suggestions):
return sorted(suggestions, key=lambda x: x[1], reverse=True)
return sorted(suggestions, key=lambda x: x[1], reverse=True)[:limit]
def analyze_word(word, input_dict, limit=1, strict=False):
"""
Analyze given word against input dictionary to find possible matches
Args:
word (string): input word to find matches
input_dict (list): input dictionary to match against word
limit (int): max number of suggestions
strict (boolean): remove words from suggestions which are below RESEMBLANCE_THRESHOLD
Returns:
word and list of possible suggestions.
e.g. 'fo', ['foo', 'bar', 'baz']
"""
match_with_scores = get_match_with_score(word, input_dict, limit, strict)
suggestions = [i[0] for i in match_with_scores]
return word, suggestions
def analyze_paragraph(text, input_dict):
"""
Analyze given paragraph again input dictionary to find wrong words and return possible suggestions.
Args:
text (string): text to analyze
input_dict (list): input dict to match against text paragraph
Returns:
list of tuples of word and possible matches.
"""
result = []
text_list = text.split()
for word in set(text_list):
# If word not present in dict, then only try to match suggestion
if word.lower() not in input_dict:
word, suggestions = analyze_word(word, input_dict, strict=True)
if suggestions:
result.append((word, suggestions))
return result
def parse_text_file(filename):
"""Parse given text file and return content of file.
Args:
filename (string): filepath to read text from
Returns:
content of path"""
with open(filename, 'r') as text_file:
text = text_file.read()
return text
def parse_dict_file(filename):
"""Parse given dictionary file and return list of words, assuming new line as delimiter.
Args:
filename (string): filepath to read dict from. Each word should be on new line
Returns:
List containing words
"""
with open(filename, 'r') as dict_file:
input_dict = [i.strip().lower() for i in dict_file.readlines()]
return input_dict
if __name__ == "__main__":
import argparse, sys, os
ap = argparse.ArgumentParser(description="Text Analyzer")
ap.add_argument("-t", help="Input text to be analyzed")
ap.add_argument("-f", help="Input file path containing text to be analyzed")
ap.add_argument("-d", help="Input file path of dictionary to be used with analyzer. Each new word should be on new line.")
args = vars(ap.parse_args())
text_path = args["f"]
text = args["t"]
dict_path = args["d"]
# Give preference over text if both -t and -f present
if not text:
# if input text file not present, exit
if not text_path or not os.path.isfile(text_path):
print("Filepath is not valid\n")
ap.print_help()
sys.exit(1)
# read content of input file
text = parse_text_file(text_path)
# if input dict file is present, load dict else use default dict
if dict_path and os.path.isfile(dict_path):
input_dict = parse_dict_file(dict_path)
else:
input_dict = DEFAULT_ENGLISH_DICT
# finally pass text and input dict to analyzer
result = analyze_paragraph(text, input_dict)
for word, suggestions in result:
print("%s => %s" %(word, ",".join(suggestions)))