-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdedup_pairwise.py
executable file
·123 lines (106 loc) · 3.55 KB
/
dedup_pairwise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
"""Naively search for duplicate text files based on string similarity."""
import difflib
from math import factorial
from itertools import combinations
from tqdm import tqdm
tqdm.monitor_interval = 0
def combos(n, k=2):
"""Compute n choose k"""
return int(factorial(n) / (factorial(k) * factorial(n - k)))
def diff(s1, s2):
"""Compute the difference betweeen two strings normalized by the length
of the longest of the two strings"""
longest = max((s1, s2), key=len)
return sum(d[0] != ' ' for d in difflib.ndiff(s1, s2)) / len(longest)
class Text(object):
"""A class modeling a text document."""
def __init__(
self,
filename,
threshold=0.25
):
self.filename = filename
self.threshold = threshold
self.data = self.load()
def load(self):
with open(self.filename, mode='r') as f:
data = f.read()
self.size = len(data)
return data
def __hash__(self):
return hash(self.filename)
def __eq__(self, other):
return diff(self.data, other.data) <= self.threshold
def __repr__(self):
return (
f'{self.__class__.__name__}('
f'{self.filename}, threshold={self.threshold})'
)
def document_sorter(document):
"""A sorting key function that sorts documents by length (descending)
and filename (ascending)"""
return -document.size, document.filename
def find_duplicates(documents):
"""Naively search all pair-wise combinations of a collection of strings
for duplicates.
The return value is a dict whose keys are strings and whose values are
sets of strings are duplicates of the key string according to the given
similarity threshold."""
deduped = set()
duplicates = {}
for pair in tqdm(
iterable=combinations(documents, 2),
total=combos(len(documents), 2),
unit='pairs',
dynamic_ncols=True
):
a, b = sorted(pair, key=document_sorter)
if (a, b) not in deduped and (a == b):
deduped.add((a, b))
if a not in duplicates:
duplicates[a] = {b}
else:
duplicates[a].add(b)
return duplicates
def main(filenames, threshold=0.25):
documents = []
for filename in tqdm(
filenames,
unit='files',
dynamic_ncols=True
):
documents.append(Text(filename, threshold=threshold))
dupe_sets = find_duplicates(documents)
for uniuqe, duplicates in dupe_sets.items():
for duplicate in duplicates:
print('\t'.join((unique, duplicate)))
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description=__doc__
)
parser.add_argument(
'filenames',
type=argparse.FileType('r'),
help=(
'file listing a set of filenames to check for duplicates; '
'one filename per line (use "-" to read filenames as lines from '
'stdin)'
)
)
parser.add_argument(
'-r', '--threshold',
type=float,
default=0.25,
help=(
'threshold for considering two strings equivalent (lower thresholds '
'are more strict; higher thresholds are more lenient)'
)
)
args = parser.parse_args()
filenames = args.filenames.read().splitlines()
if args.filenames.name != '<stdin>':
args.filenames.close()
main(filenames, threshold=args.threshold)