Merge pull request #32 from Ashutosh-agarwal2004/feature/duplicate-file-detector

devmalik7 · web-flow · commit 02d7a609a881 · 2025-10-08T00:44:05.000+05:30
Feature/duplicate file detector issue: #8
diff --git a/Python/duplicate_file_detector/README.md b/Python/duplicate_file_detector/README.md
@@ -0,0 +1,45 @@
+# 🧮 Duplicate File Detector
+
+A simple Python tool to **find and manage duplicate files** in any folder — by **name**, **size**, or **content hash**.
+
+---
+
+## 🚀 Features
+- Detect duplicates by:
+  - 🔠 **File name**
+  - 📏 **File size**
+  - 🔐 **File hash (SHA-256)**
+- Option to:
+  - 🧾 **List duplicates only**
+  - 🗑 **Safely delete duplicates** (keeps one copy)
+
+---
+
+## ⚙️ How to Run
+1. Clone the repository:
+   ```bash
+   git clone https://github.com/<your-username>/<repo>.git
+   cd <repo>/Python/duplicate_file_detector
+   ```
+
+2. Run the script:
+   ```bash
+   python find_duplicates.py
+   ```
+
+3. Follow on-screen instructions:
+   - Choose scan mode (by name / size / hash)
+   - Decide whether to delete duplicates
+
+---
+
+## 📦 Requirements
+No external libraries needed.
+
+## 🖼️ Example Output
+![App Screenshot](Screenshots/Screenshot1.PNG "Screenshot1")
+
+
+## 👨‍💻 Author
+[Ashutosh Agarwal](https://github.com/Ashutosh-agarwal2004)
+
diff --git a/Python/duplicate_file_detector/Screenshots/Screenshot1.PNG b/Python/duplicate_file_detector/Screenshots/Screenshot1.PNG
diff --git a/Python/duplicate_file_detector/find_duplicates.py b/Python/duplicate_file_detector/find_duplicates.py
@@ -0,0 +1,76 @@
+import os
+import hashlib
+from collections import defaultdict
+
+def get_file_hash(path, chunk_size=8192):
+    """Return the SHA-256 hex digest of a file, or None if it cannot be read.
+
+    Args:
+        path: Path of the file to hash.
+        chunk_size: Number of bytes read per iteration; the file is streamed
+            so large files are never loaded into memory at once.
+
+    Returns:
+        The hexadecimal digest string, or None when opening or reading the
+        file fails with an OSError.
+    """
+    digest = hashlib.sha256()
+    try:
+        with open(path, "rb") as stream:
+            while block := stream.read(chunk_size):
+                digest.update(block)
+    except OSError:
+        # PermissionError and FileNotFoundError are OSError subclasses; catching
+        # the base class also covers other read failures (I/O errors, a path
+        # that is a directory) so one bad file cannot abort a whole scan.
+        return None
+    return digest.hexdigest()
+
+def scan_folder(folder_path, mode="hash"):
+    """Walk a folder tree and group files that share a name, size, or content hash.
+
+    Args:
+        folder_path: Root directory, walked recursively with os.walk.
+        mode: Grouping key -- 'name' (file name), 'size' (byte size from
+            os.path.getsize) or 'hash' (digest from get_file_hash).
+
+    Returns:
+        A dict mapping each key to the list of file paths sharing it; only
+        keys with more than one path are included.
+
+    Raises:
+        ValueError: If mode is not 'name', 'size', or 'hash'.
+    """
+    # Validate before walking: checked per file, a bad mode would go unnoticed
+    # whenever the folder is empty or missing.
+    if mode not in ("name", "size", "hash"):
+        raise ValueError("Mode must be 'name', 'size', or 'hash'.")
+
+    groups = defaultdict(list)
+    for root, _, files in os.walk(folder_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if mode == "name":
+                key = file
+            elif mode == "size":
+                try:
+                    key = os.path.getsize(file_path)
+                except OSError:
+                    continue  # file vanished or is unreadable; skip it
+            else:
+                key = get_file_hash(file_path)
+                if not key:
+                    continue  # get_file_hash signals an unreadable file with None
+            groups[key].append(file_path)
+    return {key: paths for key, paths in groups.items() if len(paths) > 1}
+
+def display_duplicates(duplicates):
+    """Print every duplicate group to stdout.
+
+    Args:
+        duplicates: Mapping of group key to a list of file paths, as
+            returned by scan_folder. Only the path lists are shown.
+
+    When the mapping is empty a "no duplicates" notice is printed instead.
+    """
+    if duplicates:
+        print("\n🔍 Duplicate Files Found:\n")
+        separator = "-" * 60
+        for paths in duplicates.values():
+            print(f"Group ({len(paths)} files):")
+            for path in paths:
+                print("  ", path)
+            print(separator)
+    else:
+        print("✅ No duplicates found.")
+
+def delete_duplicates(duplicates):
+    """Delete redundant copies in each duplicate group, keeping the first file.
+
+    Args:
+        duplicates: Mapping of group key to a list of file paths, as
+            returned by scan_folder. The first path of each list is kept.
+
+    A file is removed only after a byte-for-byte comparison confirms it matches
+    the kept copy. Groups built by name or size can contain files with
+    different content, and a file may change between scan and deletion;
+    removing those would destroy data. Failures are reported and skipped.
+    """
+    import filecmp  # local import: only this function needs it
+
+    for files in duplicates.values():
+        keep = files[0]
+        print(f"\nKeeping: {keep}")
+        for f in files[1:]:
+            try:
+                if not filecmp.cmp(keep, f, shallow=False):
+                    print(f"⚠️ Skipped (content differs from kept file): {f}")
+                    continue
+                os.remove(f)
+            except OSError as e:
+                # Covers a missing or unreadable file as well as a failed removal.
+                print(f"⚠️ Could not delete {f}: {e}")
+            else:
+                print(f"🗑 Deleted duplicate: {f}")
+
+if __name__ == "__main__":
+    # Interactive entry point: ask for a folder and a comparison mode, list the
+    # duplicate groups, then optionally delete the extra copies.
+    folder = input("Enter folder path to scan: ").strip()
+    if not os.path.isdir(folder):
+        # os.walk yields nothing for a missing path, so without this check a
+        # mistyped folder would be reported as "No duplicates found".
+        raise SystemExit(f"❌ Not a folder: {folder!r}")
+
+    print("\nSelect comparison mode:")
+    print("1. By name\n2. By size\n3. By hash (recommended)")
+    mode_choice = input("Enter choice (1/2/3): ").strip()
+    # Any unrecognised choice falls back to hash comparison.
+    mode = {"1": "name", "2": "size", "3": "hash"}.get(mode_choice, "hash")
+
+    duplicates = scan_folder(folder, mode)
+    display_duplicates(duplicates)
+
+    if duplicates:
+        action = input("\nDo you want to delete duplicates? (y/n): ").strip().lower()
+        if action == "y":
+            delete_duplicates(duplicates)
+            print("\n✅ Deletion complete.")
+        else:
+            print("\nNo files were deleted.")