#!/usr/bin/env python
# encoding: utf-8
'''
Writing Smell Detector is a tool to help find problems in your writing.
'''
import os
import sys
import math
import json
from glob import glob
import codecs
__author__ = 'John Joseph Horton, utapyngo'
__copyright__ = 'Copyright (C) 2012 John Joseph Horton, utapyngo, oDesk'
__credits__ = ['qbonnard', 'eventh']
__license__ = 'GPL'
__maintainer__ = 'utapyngo'
__email__ = 'utapyngo@gmail.com'
__status__ = 'Development'
__version__ = '0.1'


def _setup_logger():
    '''
    Set up and return a console logger.
    '''
    import logging
    logger = logging.getLogger(__name__)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter('%(levelname)s: %(message)s'))
    logger.addHandler(console_handler)
    logger.setLevel(logging.INFO)
    return logger

LOG = _setup_logger()
# sys.stdout.encoding is None when piping to a file
# sys.stdout does not have the `encoding` attribute with GAE dev_appserver
_encoding = sys.stdout.encoding if hasattr(sys.stdout, 'encoding') else None
if _encoding is None:
    _encoding = sys.getfilesystemencoding()


def print_console(*args):
    '''
    Print unicode args to the console, replacing characters that the
    console encoding cannot represent.
    '''
    # Encode with 'replace' and decode again so that printing can never
    # raise UnicodeEncodeError, whatever the console encoding is.
    print(u' '.join(args).encode(_encoding, 'replace').decode(_encoding))


class Rule(object):
    '''
    Base class of rules.
    '''

    def __init__(self, ruleset, name, comments=None, props=None):
        self.ruleset = ruleset
        self.name = name
        # None defaults instead of mutable default arguments, which
        # would be shared between all instances
        self.comments = comments or []
        self.props = props or {}
        self.patterns = [{'original': ''}]

    def itermatches(self, text):
        '''
        Yield (pattern, matches) pairs, where pattern is one of the
        dicts from self.patterns and matches is an iterable of
        (start, end) offsets into text. Subclasses must override this.
        '''
        raise NotImplementedError

    def process(self, text):
        '''
        Apply the rule to text and return the result
        '''
        matched_lines = {}
        pattern_matches = {}

        def add_line(line, lineno):
            '''
            Add a line to the matched_lines dictionary.
            If the line contains newlines, split it into multiple
            lines and add all of them.
            '''
            for i, chunk in enumerate(line.strip('\n').split('\n')):
                matched_lines[lineno + i] = chunk

        for pattern, matches in self.itermatches(text):
            rmatches = {}
            line = None
            for match in matches:
                start, end = match
                lineno = text.count('\n', 0, start) + 1
                linestart = text.rfind('\n', 0, start) + 1
                lineend = text.find('\n', end, -1)
                if lineend > 0:
                    line = text[linestart:lineend]
                else:
                    line = text[linestart:]
                add_line(line, lineno)
                # location of the match relative to the current line
                lstart, lend = start - linestart, end - linestart
                linespan = line.strip().count('\n') + 1
                if lineno in rmatches:
                    rmatches[lineno][0] = linespan
                    rmatches[lineno][1].append((lstart, lend))
                else:
                    rmatches[lineno] = [linespan, [(lstart, lend)]]
            if rmatches:
                pattern_matches[pattern['original']] = rmatches
        return ProcessedRule(self, matched_lines, pattern_matches)
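

# A minimal sketch (not part of the original tool) of a concrete rule,
# showing the contract that process() relies on: itermatches() must yield
# (pattern, matches) pairs where pattern is one of the dicts from
# self.patterns and matches is an iterable of (start, end) offsets.
# The real rule classes live in the separately shipped *_rules.py modules;
# this class exists purely as interface documentation.
class _ExampleRegexRule(Rule):

    def __init__(self, ruleset, name, regexes, comments=None, props=None):
        Rule.__init__(self, ruleset, name, comments, props)
        import re
        self.patterns = [{'original': r} for r in regexes]
        self._compiled = [(p, re.compile(p['original']))
                          for p in self.patterns]

    def itermatches(self, text):
        for pattern, regex in self._compiled:
            yield pattern, [(m.start(), m.end())
                            for m in regex.finditer(text)]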


class Ruleset(object):
    '''
    Set of rules. Concrete rulesets are expected to populate
    self.rules with Rule instances.
    '''

    def __init__(self, name, comments=None, props=None, uid=None):
        self.name = name
        self.comments = comments or []
        self.props = props or {}
        # filled in by concrete ruleset implementations
        self.rules = []
        if uid:
            self.uid = uid
        else:
            self.uid = name

    def process(self, text):
        '''
        Apply all rules to text and return the results.
        '''
        matched_lines = {}
        matched_rules = []
        for rule in self.rules:
            processed_rule = rule.process(text)
            matched_lines.update(processed_rule.lines)
            matched_rules.append(processed_rule)
        return ProcessedRuleset(self, matched_lines, matched_rules)


class ProcessedRule(object):
    '''
    Processed rule.
    Contains a reference to the original rule,
    a dictionary of matched lines and
    a dictionary of pattern matches.
    '''

    def __init__(self, rule, lines, pattern_matches):
        self.rule = rule
        # dictionary of { lineno: line }
        self.lines = lines
        # dictionary of { pattern: { lineno: [linespan, [(start, end), ...]] } }
        self.pattern_matches = pattern_matches
        # total number of matched lines across all patterns
        self.nummatches = sum(len(m) for m in pattern_matches.values())


class ProcessedRuleset(object):
    '''
    Processed ruleset.
    Contains a reference to the original ruleset,
    a dictionary of matched lines and
    a list of processed rules.
    '''

    def __init__(self, ruleset, lines, rules):
        self.ruleset = ruleset
        self.lines = lines
        self.rules = rules
        self.nummatches = sum(rule.nummatches for rule in rules)

    def to_dict(self):
        '''
        Build and return a dictionary suitable for serialization
        which contains all the information about the ruleset,
        including all matches.
        '''
        # .data is expected to be provided by the concrete ruleset
        result = self.ruleset.data.copy()
        matched_rules = []
        for processed_rule in self.rules:
            rule_data = processed_rule.rule.data.copy()
            rule_data['matches'] = processed_rule.pattern_matches
            matched_rules.append(rule_data)
        result['rules'] = matched_rules
        return result
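

# The dict produced by to_dict() mirrors the structure built in
# Rule.process(); any keys besides 'rules' and 'matches' come from the
# ruleset's own .data. Roughly:
#   {..., 'rules': [{..., 'matches':
#       {pattern: {lineno: [linespan, [(start, end), ...]]}}}]}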


class ProcessedRulesets(object):
    '''
    A collection of processed rulesets.
    Contains methods for serialization of results.
    '''

    def __init__(self, rulesets, text):
        '''
        Process all rulesets against the text.
        Args:
            rulesets: A list of raw rulesets.
            text: The text to process.
        '''
        processed_rulesets = []
        matched_lines = {}
        for ruleset in rulesets:
            processed_ruleset = ruleset.process(text)
            matched_lines.update(processed_ruleset.lines)
            processed_rulesets.append(processed_ruleset)
        self.rulesets = processed_rulesets
        self.lines = matched_lines
        self.text = text

    def to_dict(self, include_lines):
        '''
        Args:
            include_lines: A boolean indicating whether matched lines
                should be included.
        Returns:
            A dict suitable for serialization,
            containing a 'rulesets' key (a list of processed rulesets)
            and optionally a 'lines' key.
        '''
        result = {'rulesets': [r.to_dict() for r in self.rulesets]}
        if include_lines:
            result['lines'] = self.lines
        return result

    def to_html(self, embed_css=True, include_empty=False):
        '''
        Convert results into HTML.
        Args:
            embed_css: A boolean indicating whether the CSS should be
                embedded into the HTML code.
            include_empty: A boolean indicating whether empty rulesets,
                rules and patterns should be included.
        Returns:
            A string of HTML code representing the results.
        '''
        from jinja2 import Environment, FileSystemLoader
        loader = FileSystemLoader(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), 'html'))
        env = Environment(loader=loader)
        template = env.get_template('template.html')
        return template.render(
            rulesets=self.rulesets,
            lines=self.lines,
            text=self.text,
            css=loader.get_source(env, 'style.css')[0] if embed_css else None,
            include_empty=include_empty)

    def to_console(self, include_empty=False):
        '''
        Print results to the console.
        Args:
            include_empty: A boolean indicating whether empty rulesets,
                rules and patterns should be printed.
        '''
        # max number of digits in a line number; +2 instead of +1 so that
        # exact powers of ten (e.g. line 100) still get enough width
        max_digits = int(math.ceil(math.log10(self.text.count('\n') + 2)))

        def print_line(line, lineno):
            '''
            Print a line together with its line number.
            If the line contains newlines, split it into multiple
            lines and print all of them.
            '''
            for i, chunk in enumerate(line.strip().split('\n')):
                print_console(u'{1:>{0}}: {2}'
                              .format(max_digits, lineno + i, chunk))

        def print_pattern(rule, pattern):
            matched_lines = rule.pattern_matches.get(pattern, {})
            nummatches = len(matched_lines)
            if not include_empty and nummatches == 0:
                return
            print()
            print_console(u' Pattern: {0} ({1})'
                          .format(pattern, nummatches))
            if hasattr(rule.rule, 'get_pattern_props'):
                props = rule.rule.get_pattern_props(pattern)
                for pp, pv in props.items():
                    print_console(u' {0}: {1}'.format(pp.title(), pv))
            for lineno in sorted(matched_lines.keys()):
                linespan, matches = matched_lines[lineno]
                data = ''
                for i in range(linespan):
                    data += self.lines[lineno + i] + '\n'
                # split the matched line(s) into plain and highlighted chunks
                offset = 0
                chunks = [{'data': data}]
                for match in matches:
                    start = match[0] - offset
                    end = match[1] - offset
                    line = chunks.pop()['data']
                    chunks.append({'highlight': False,
                                   'data': line[:start]})
                    chunks.append({'highlight': True,
                                   'data': line[start:end]})
                    chunks.append({'highlight': False,
                                   'data': line[end:]})
                    offset += len(line) - len(chunks[-1]['data'])
                # mark highlighted chunks with asterisks
                for chunk in chunks:
                    if chunk['highlight']:
                        chunk['data'] = '*' + chunk['data'] + '*'
                print_line(''.join([c['data'] for c in chunks]), lineno)

        for ruleset in self.rulesets:
            if not include_empty and ruleset.nummatches == 0:
                continue
            print()
            print()
            print_console(u' {0} ({1})'
                          .format(ruleset.ruleset.name, ruleset.nummatches))
            if ruleset.ruleset.comments:
                for comment in ruleset.ruleset.comments:
                    print_console(comment)
            for rule in ruleset.rules:
                if not include_empty and rule.nummatches == 0:
                    continue
                print()
                print_console(u' Rule: {0} ({1})'
                              .format(rule.rule.name, rule.nummatches))
                if rule.rule.comments:
                    for comment in rule.rule.comments:
                        print_console(comment)
                for prop, value in rule.rule.props.items():
                    if value:
                        print_console(u' {0}: {1}'.format(prop.title(), value))
                for pattern in rule.rule.patterns:
                    print_pattern(rule, pattern['original'])


class IterableEncoder(json.JSONEncoder):
    '''
    JSON encoder which supports encoding of iterables
    (sets, generators, ...) as JSON arrays.
    '''

    def default(self, o):
        try:
            iterable = iter(o)
        except TypeError:
            pass
        else:
            return list(iterable)
        return super(IterableEncoder, self).default(o)
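

# For example, json.dumps({'lines': {1, 3}}, cls=IterableEncoder) serializes
# the set as a JSON array, where the stock encoder would raise a TypeError.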


def get_base_path():
    '''
    Return the directory containing this script.
    '''
    return os.path.dirname(os.path.abspath(__file__))


def get_rule_types():
    '''
    Yield the names of the available rule types: a module named
    foo_rules.py next to this script provides the rule type "foo".
    '''
    rules_mask = os.path.join(get_base_path(), '*_rules.py')
    for fn in glob(rules_mask):
        # strip the trailing '_rules.py' (9 characters)
        yield os.path.basename(fn)[:-9]
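

# A hypothetical sketch (not shipped with the tool) of the get_rulesets()
# entry point that each *_rules.py module is assumed to expose. main()
# imports such a module and calls get_rulesets(), passing as many extra
# command line arguments as the function's signature requires.
def _example_get_rulesets():
    ruleset = Ruleset('Example', comments=['An illustrative ruleset.'])
    ruleset.rules = [
        _ExampleRegexRule(ruleset, 'Intensifiers',
                          [r'\bvery\b', r'\breally\b'])
    ]
    return [ruleset]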


def parse_args():
    '''
    Parse and return command line arguments.
    '''
    ruletypes = get_rule_types()
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str,
                        help='text file')
    parser.add_argument('rules', type=str, nargs='*',
                        help='rule type followed by arguments required by '
                             'this type; available types of rules are: ' +
                             ', '.join(ruletypes))
    #parser.add_argument('--info', type=str, metavar='RULETYPE',
    #    help='show number of arguments and docstrings of a specific rule type')
    export = parser.add_argument_group('export')
    html_group = parser.add_argument_group('html')
    json_group = parser.add_argument_group('json')
    export.add_argument('-o', '--outfile', action='store',
                        help='output file name')
    export.add_argument('-f', '--output-format',
                        default='html', choices=('json', 'html'),
                        help='output file format')
    export.add_argument('-e', '--include-empty', action='store_true',
                        help='include empty rules in the output')
    html_group.add_argument('-nec', '--no-embed-css', action='store_true',
                            help="don't embed style.css into the generated "
                                 'html file')
    json_group.add_argument('-r', '--reftext', action='store_true',
                            help='insert a reference to the text file into '
                                 'the output json file instead of the list '
                                 'of matching lines')
    json_group.add_argument('-a', '--abspath', action='store_true',
                            help='insert the absolute path of the text file '
                                 'into the output json file instead of the '
                                 'path passed on the command line')
    json_group.add_argument('-i', '--indent',
                            type=int, action='store', default=4,
                            help='json indent size')
    # argparse reports bad arguments itself and exits via SystemExit,
    # so there is no point in catching Exception around parse_args()
    args = parser.parse_args()
    if args.indent == 0:
        args.indent = None
    return args
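

# Typical invocations (file and rule type names are illustrative):
#   python wsd.py draft.txt                      # run all discovered rule types
#   python wsd.py draft.txt pattern              # run only the "pattern" rules
#   python wsd.py draft.txt -o report.html       # save an HTML report
#   python wsd.py draft.txt -f json -o out.json  # save raw matches as JSON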


def main(args=None):
    '''
    Load text from args.text.
    Load and run rulesets named in the args.rules list.
    Store results to args.outfile if specified.
    '''
    # parse lazily instead of in the argument default, which would run
    # at import time
    if args is None:
        args = parse_args()
    # Check for errors
    if not os.path.isfile(args.text):
        LOG.error('File not found: ' + args.text)
        return 1
    # Load text
    with codecs.open(args.text, encoding='utf-8') as textfile:
        text = textfile.read()
    LOG.info('Loaded {0} characters from {1}'.format(len(text), args.text))
    # Load rules
    rulesets = []
    rules = args.rules
    if rules:
        while rules:
            rule_name = rules.pop(0)
            mod = __import__('{0}_rules'.format(rule_name))
            if hasattr(mod, 'get_rulesets'):
                # consume as many arguments as get_rulesets() accepts
                argcount = mod.get_rulesets.__code__.co_argcount
                rule_args = rules[:argcount]
                rules[:argcount] = []
                rulesets.extend(mod.get_rulesets(*rule_args))
    else:
        import importlib.util
        for rule_name in get_rule_types():
            try:
                modname = '{0}_rules'.format(rule_name)
                spec = importlib.util.spec_from_file_location(
                    modname,
                    os.path.join(get_base_path(), '{0}.py'.format(modname)))
                mod = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(mod)
                rulesets.extend(mod.get_rulesets())
                LOG.info('Loaded: {0}'.format(rule_name))
            except Exception as e:
                LOG.warning('Not loaded: {0}: {1}'.format(rule_name, e))
    # Process rules
    prulesets = ProcessedRulesets(rulesets, text)
    # Output the result
    if args.outfile:
        if args.output_format == 'json':
            if args.reftext:
                path = os.path.abspath(args.text) if args.abspath else args.text
                json_results = prulesets.to_dict(False)
                import hashlib
                json_results['text'] = {
                    'file': path,
                    # hash the file contents so consumers can verify that
                    # the referenced text has not changed
                    'md5': hashlib.md5(text.encode('utf-8')).hexdigest()
                }
            else:
                json_results = prulesets.to_dict(True)
            with codecs.open(args.outfile, 'w', encoding='utf-8') as outfile:
                json.dump(json_results, outfile,
                          indent=args.indent, cls=IterableEncoder)
        elif args.output_format == 'html':
            html = prulesets.to_html(not args.no_embed_css, args.include_empty)
            with codecs.open(args.outfile, 'w', encoding='utf-8') as outfile:
                outfile.write(html)
        LOG.info('Results saved to: {0}'.format(args.outfile))
    else:
        prulesets.to_console(args.include_empty)


if __name__ == '__main__':
    sys.exit(main())