deletecopies.py
#!/usr/bin/env python
import os
import sys
import hashlib
from collections import defaultdict
import time


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes."""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
    """Return the digest of a file, hashing only the first 1024 bytes if requested."""
    hashobj = hash_algo()
    with open(filename, "rb") as f:
        if first_chunk_only:
            hashobj.update(f.read(1024))
        else:
            for chunk in chunk_reader(f):
                hashobj.update(chunk)
    return hashobj.digest()


def check_for_duplicates(paths, statement):
    """Walk the given paths, delete duplicate files, and report via statement.set()."""
    z = 0
    start_time = time.time()
    removed = 0
    skipped = 0
    files_by_size = defaultdict(list)
    files_by_small_hash = defaultdict(list)
    files_by_full_hash = dict()

    # Pass 1: group every reachable file by size - only equal-sized files can be duplicates.
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc.) - skip it
                    continue
                files_by_size[file_size].append(full_path)
                print("Check " + str(z))
                z += 1

    # Pass 2: for all files with the same size, hash only the first 1024 bytes.
    for files in files_by_size.values():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend CPU cycles on it
        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file may have become inaccessible since it was listed
                continue
            files_by_small_hash[small_hash].append(filename)
            print("Check " + str(z))
            z += 1

    # Pass 3: for files that share a first-chunk hash, hash the full file -
    # full-hash collisions are duplicates.
    for files in files_by_small_hash.values():
        if len(files) < 2:
            # the hash of the first 1k bytes is unique -> skip this file
            continue
        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
            except OSError:
                # the file may have become inaccessible since it was listed
                continue
            if full_hash in files_by_full_hash:
                duplicate = files_by_full_hash[full_hash]
                print("Duplicate found:\n - %s\n - %s\n" % (filename, duplicate))
                # normalise doubled backslashes in Windows paths before deleting
                filename = filename.replace('\\\\', '\\')
                try:
                    os.remove(filename)
                    removed += 1
                except PermissionError:
                    skipped += 1
                    print("Check " + str(z) + " skipped")
            else:
                files_by_full_hash[full_hash] = filename
            print("Check " + str(z))
            z += 1

    statement.set("✅ Finished cleaning - " + str(removed) + " files removed in "
                  + str(round(time.time() - start_time, 4)) + " seconds & "
                  + str(skipped) + " files with permission error")
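

# Minimal usage sketch, assuming `statement` only needs to expose a .set(text)
# method (in a GUI this would typically be something like a tkinter StringVar).
# The stand-in class and the command-line entry point below are illustrative
# assumptions, not a documented interface of this script.
if __name__ == "__main__":
    class _ConsoleStatement:
        """Stand-in for the GUI status object: just print the final summary."""
        def set(self, text):
            print(text)

    # Scan the directories given on the command line, e.g.:
    #   python deletecopies.py /path/to/dir1 /path/to/dir2
    check_for_duplicates(sys.argv[1:], _ConsoleStatement())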