-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: rmAllDupByExt.py
120 lines (102 loc) · 4.33 KB
/
rmAllDupByExt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import hashlib
import platform
import argparse
import subprocess
# Switch the console code page to UTF-8 so non-ASCII file names print
# correctly. 'chcp' exists only on Windows; without this guard the
# check=True call raises CalledProcessError on Linux/macOS and kills
# the script at import time.
if platform.system() == 'Windows':
    subprocess.run(['chcp', '65001'], shell=True, check=True)
# File extensions (lower-case, with dot) considered for deduplication.
file_types = ['.epub', '.mobi', '.azw3', '.azw', '.pdf', '.mp4', '.avi', '.mkv', '.wmv']
# Maps an MD5 hex digest -> list of file paths whose content has that digest.
hash_to_files = {}
def get_file_hash(file_path, chunk_size=1 << 20):
    """Return the MD5 hex digest of a file's content, or None on error.

    Reads the file incrementally in *chunk_size* blocks (default 1 MiB)
    instead of loading it whole — the watched extensions include large
    video files that may not fit comfortably in memory.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Bytes to read per iteration (keyword-compatible
            addition; existing positional callers are unaffected).

    Returns:
        Hex digest string, or None if the file could not be read.
    """
    try:
        digest = hashlib.md5()
        with open(file_path, 'rb') as f:
            # iter() with a b'' sentinel reads until EOF without a
            # manual while/break loop.
            for chunk in iter(lambda: f.read(chunk_size), b''):
                digest.update(chunk)
        return digest.hexdigest()
    except Exception as e:
        # Best-effort: report the unreadable file and keep scanning.
        print(f"Failed to read file: {file_path}. Error: {e}")
        return None
def find_duplicates(directory):
    """Walk *directory* recursively and bucket matching files by MD5 hash.

    Results accumulate in the module-level ``hash_to_files`` dict, so
    successive calls merge findings across multiple directories/drives.
    """
    global hash_to_files
    for current_root, _subdirs, filenames in os.walk(directory):
        for name in filenames:
            path = os.path.join(current_root, name)
            extension = os.path.splitext(path)[1].lower()
            # Only hash files whose extension is on the watch list.
            if extension not in file_types:
                continue
            digest = get_file_hash(path)
            # get_file_hash returns None for unreadable files; skip those.
            if digest:
                hash_to_files.setdefault(digest, []).append(path)
def create_batch_file(delete_commands):
    """Write *delete_commands* to 'delete_duplicates.bat' for review/re-run.

    The generated script suppresses echo, switches the console to UTF-8
    (chcp 65001) so non-ASCII paths survive, and ends with 'pause' so
    the window stays open after execution.
    """
    header = '@echo off\n' + 'chcp 65001\n'
    body = '\n'.join(delete_commands)
    script = header + body + '\npause'
    # UTF-8 encoding matches the chcp 65001 code page set in the header.
    with open('delete_duplicates.bat', 'w', encoding='utf-8') as handle:
        handle.write(script)
    print("Batch file 'delete_duplicates.bat' created successfully.")
def delete_file(file_path):
    """Remove *file_path* from disk, printing the outcome.

    Failures (missing file, permissions, ...) are reported rather than
    raised so one bad file cannot abort the whole cleanup run.
    """
    try:
        os.remove(file_path)
    except Exception as e:
        print(f"Failed to delete file: {file_path}. Error: {e}")
    else:
        print(f"Successfully deleted file: {file_path}")
def delete_duplicates(rm):
    """Report duplicates and either delete them or emit a delete script.

    For every hash bucket in ``hash_to_files`` the first file is kept
    and the rest are treated as duplicates. When *rm* is true the
    duplicates are removed immediately; the batch script is still
    written so the operation is auditable/repeatable.

    Args:
        rm: If true, delete duplicates on the spot via delete_file().
    """
    global hash_to_files
    delete_commands = []
    # Resolve the platform-specific command once, not once per file.
    on_windows = platform.system() == 'Windows'
    for file_paths in hash_to_files.values():
        # Keep the first occurrence; everything after it is a duplicate.
        for file_path in file_paths[1:]:
            print(f"Found duplicate file: {file_path}")
            if on_windows:
                delete_commands.append(f"del /F \"{file_path}\"")
            else:
                # 'rm -f', not 'rm -rf': targets are always files, and a
                # recursive flag would be dangerous if a path ever
                # resolved to a directory.
                delete_commands.append(f"rm -f \"{file_path}\"")
            if rm:
                delete_file(file_path)
    if delete_commands:
        create_batch_file(delete_commands)
    else:
        # Nothing found: don't write a pointless empty batch script.
        print("No duplicate files found.")
def scan_directory(directory):
    """Run the duplicate finder over *directory*, logging start/finish."""
    start_msg = f"Scanning directory: {directory}..."
    finish_msg = f"Finished scanning directory: {directory}"
    print(start_msg)
    find_duplicates(directory)
    print(finish_msg)
def scan_drives_windows():
    """Return root paths (e.g. 'C:\\\\') of every existing drive on Windows.

    Probes drive letters A-Z in true alphabetical order. (Bug fix: the
    original literal was 'ABDCEF...' with C and D transposed, so drives
    were probed and returned out of order — D:\\ before C:\\.)

    Returns:
        List of existing drive roots such as ['C:\\\\', 'D:\\\\'];
        empty on systems with no '<letter>:\\\\' paths.
    """
    return [f"{d}:\\" for d in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' if os.path.exists(f"{d}:\\")]
def scan_drives_linux():
    """Return scan roots for POSIX systems: everything lives under '/'."""
    roots = ['/']
    return roots
def main():
    """Parse CLI options, scan the requested location(s), then clean up.

    With no -d argument (default '/') every drive root for the current
    OS is scanned; otherwise only the given directory tree is. Finally
    duplicates are deleted (or a delete script is generated) per -rm.
    """
    parser = argparse.ArgumentParser(description='Delete duplicate files.')
    parser.add_argument('-rm', action='store_true', help='Delete duplicate files immediately, by default generate delete script')
    parser.add_argument('-d', '--directory', type=str, default='/', help='Directory to scan for duplicate files (default: root directory).')
    args = parser.parse_args()

    if args.directory != '/':
        # An explicit directory was given: scan only that tree.
        scan_directory(args.directory)
    else:
        # Default target: enumerate every drive root for this OS.
        if platform.system() == 'Windows':
            drives = scan_drives_windows()
        else:
            drives = scan_drives_linux()
        for drive in drives:
            scan_directory(drive)

    # Act on everything accumulated in hash_to_files during the scans.
    delete_duplicates(args.rm)
    print("Done!")


if __name__ == "__main__":
    main()