updated fileFilter_test.py file
Deleted FileFilter folder and added unit tests for setMinFileSize method of fileFilter class in tests folder.
kushwxha committed Oct 2, 2024
1 parent 9c5faae commit bf3c4de
Showing 8 changed files with 145 additions and 61 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -57,6 +57,8 @@ python twinTrim.py <directory> [OPTIONS]
- `--max-size`: Specify the maximum file size to include in the scan (e.g., `1gb`).
- `--file-type`: Specify the file type to include (e.g., `.txt`, `.jpg`).
- `--exclude`: Exclude specific files by name.
- `--label-color`: Set the font color of the progress bar's output label.
- `--bar-color`: Set the color of the progress bar.

### Examples

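The two new options compose with the existing flags; a hypothetical invocation (directory and values invented for illustration):

```
python twinTrim.py ./downloads --all --min-size 1kb --label-color cyan --bar-color green
```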
42 changes: 0 additions & 42 deletions tests/fileFilter.py

This file was deleted.

44 changes: 44 additions & 0 deletions tests/fileFilter_test.py
@@ -0,0 +1,44 @@
import pytest
from twinTrim.dataStructures.fileFilter import FileFilter


def test_set_min_file_size_valid():
    """Test setting valid minimum file size values."""
    file_filter = FileFilter()

    # Test with a larger valid size
    file_filter.setMinFileSize("20kb")
    assert file_filter.minFileSize == "20kb", "Failed to set min file size to 20kb"

    # Test with a smaller valid size
    file_filter.setMinFileSize("5kb")
    assert file_filter.minFileSize == "5kb", "Failed to set min file size to 5kb"

    # Test with an edge case (1kb)
    file_filter.setMinFileSize("1kb")
    assert file_filter.minFileSize == "1kb", "Failed to set min file size to 1kb"

def test_set_min_file_size_empty_string():
    """Test setting an empty string for minimum file size."""
    file_filter = FileFilter()

    # An empty string should be allowed since no validation exists
    file_filter.setMinFileSize("")
    assert file_filter.minFileSize == "", "Failed to set min file size to an empty string"

def test_set_min_file_size_special_characters():
    """Test setting special characters or a random string as min file size."""
    file_filter = FileFilter()

    # Special characters should be accepted since no validation exists
    file_filter.setMinFileSize("!!invalid!!")
    assert file_filter.minFileSize == "!!invalid!!", "Failed to set min file size to special characters"

def test_set_min_file_size_numeric_string():
    """Test setting a numeric string as minimum file size."""
    file_filter = FileFilter()

    # A numeric string should be accepted, even without units
    file_filter.setMinFileSize("123")
    assert file_filter.minFileSize == "123", "Failed to set min file size to a numeric string"
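These four tests only pass if `setMinFileSize` stores its argument verbatim; a minimal sketch of such a setter, assuming no validation (inferred from the assertions above, not copied from the repository):

```python
class FileFilter:
    def __init__(self):
        # Defaults mirror the ones visible in twinTrim/dataStructures/fileFilter.py below.
        self.minFileSize = "0kb"
        self.maxFileSize = "1gb"

    def setMinFileSize(self, size: str) -> None:
        # Plain assignment: any string ("", "!!invalid!!", "123") is accepted as-is.
        self.minFileSize = size
```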

76 changes: 76 additions & 0 deletions tests/fileMetadata.py
@@ -0,0 +1,76 @@
import threading
import pytest
from twinTrim.dataStructures.fileMetadata import add_or_update_normal_file, normalStore, normalStore_lock

# Mock get_file_hash so the tests don't depend on actual files on disk
def mock_get_file_hash(file_path):
    return f"hash_{file_path}"

# This fixture automatically replaces get_file_hash with mock_get_file_hash
@pytest.fixture(autouse=True)
def mock_get_file_hash_func(monkeypatch):
    monkeypatch.setattr("twinTrim.dataStructures.fileMetadata.get_file_hash", mock_get_file_hash)

# Automatically reset the normalStore before each test
@pytest.fixture(autouse=True)
def reset_normal_store():
    normalStore.clear()

def test_add_or_update_normal_file_concurrently():
    """Test that normalStore stays consistent when add_or_update_normal_file is called concurrently with different file paths by multiple threads."""

    # List of file paths to be added concurrently
    file_paths = [f"file_{i}" for i in range(10)]

    # This function will be called by each thread to add a file path
    def worker(file_path):
        add_or_update_normal_file(file_path)

    # Threads for concurrent execution
    threads = [threading.Thread(target=worker, args=(file_path,)) for file_path in file_paths]

    # Start all the threads for concurrent execution
    for thread in threads:
        thread.start()

    # Wait for all the threads to finish
    for thread in threads:
        thread.join()

    # Check that normalStore has exactly 10 unique file hashes (all file paths are unique)
    assert len(normalStore) == 10

    # Check that each file hash maps to exactly one file path
    for file_path in file_paths:
        file_hash = mock_get_file_hash(file_path)
        assert file_hash in normalStore
        assert normalStore[file_hash].filepaths == [file_path]

def test_add_or_update_normal_file_with_duplicates_concurrently():
    """Test that adding duplicate file paths to normalStore does not create duplicate entries."""

    file_path = "duplicate_file.txt"
    num_threads = 5

    # This function will be called by each thread to add the same file path
    def worker():
        add_or_update_normal_file(file_path)

    # Create a list of threads that add the same file path concurrently
    threads = [threading.Thread(target=worker) for _ in range(num_threads)]

    # Start all the threads for concurrent execution
    for thread in threads:
        thread.start()

    # Wait for all the threads to finish
    for thread in threads:
        thread.join()

    # Check that normalStore has exactly one unique file hash (all file paths are the same)
    file_hash = mock_get_file_hash(file_path)
    assert len(normalStore) == 1
    assert file_hash in normalStore

    # Check that the file path was added only once despite being added by multiple threads
    assert normalStore[file_hash].filepaths == [file_path]
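The assertions above rely on FileMetadata keeping a `filepaths` list and on `insert_file` ignoring duplicates; a sketch of such a class, with its shape inferred from the tests rather than copied from the repository:

```python
class FileMetadata:
    def __init__(self, filepaths):
        self.filepaths = list(filepaths)

    def insert_file(self, file_path):
        # Ignore duplicates so concurrent re-insertion of the same path is a no-op.
        if file_path not in self.filepaths:
            self.filepaths.append(file_path)
```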
2 changes: 1 addition & 1 deletion twinTrim/dataStructures/fileFilter.py
@@ -3,7 +3,7 @@

class FileFilter:
    def __init__(self):
-       self.minFileSize = "10kb"
+       self.minFileSize = "0kb"
        self.maxFileSize = "1gb"
        self.fileType = r"^.+\.*$"
        self.fileExclude = []
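`filter_files`, which twinTrim/flagController.py calls below, is elided from this diff; a sketch of what such a filter could look like, assuming size strings like `0kb`/`1gb` and the defaults above (the `parse_size` helper is hypothetical):

```python
import os
import re

def parse_size(size_str: str) -> int:
    """Convert a size string such as '10kb' or '1gb' to bytes (assumed format)."""
    units = {"b": 1, "kb": 1024, "mb": 1024 ** 2, "gb": 1024 ** 3}
    match = re.fullmatch(r"(\d+)\s*([a-z]+)?", size_str.strip().lower())
    if not match:
        return 0
    value, unit = match.groups()
    return int(value) * units.get(unit or "b", 1)

def filter_files(self, file_path: str) -> bool:
    """Return True if file_path passes the exclusion, type, and size filters."""
    name = os.path.basename(file_path)
    if name in self.fileExclude:
        return False
    if not re.match(self.fileType, name):
        return False
    size = os.path.getsize(file_path)
    return parse_size(self.minFileSize) <= size <= parse_size(self.maxFileSize)
```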
3 changes: 1 addition & 2 deletions twinTrim/dataStructures/fileMetadata.py
@@ -27,5 +27,4 @@ def add_or_update_normal_file(file_path: str):
    if existing_file_metadata is None:
        normalStore[file_hash] = new_file_metadata
    else:
-        existing_file_metadata.insert_file(file_path)
-
+        existing_file_metadata.insert_file(file_path)
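Only the tail of `add_or_update_normal_file` appears in this hunk; given that the tests import `normalStore_lock`, the full function plausibly looks something like this (a sketch; the `FileMetadata` constructor signature is assumed):

```python
def add_or_update_normal_file(file_path: str):
    file_hash = get_file_hash(file_path)
    new_file_metadata = FileMetadata([file_path])
    with normalStore_lock:  # guard the shared dict against concurrent writers
        existing_file_metadata = normalStore.get(file_hash)
        if existing_file_metadata is None:
            normalStore[file_hash] = new_file_metadata
        else:
            existing_file_metadata.insert_file(file_path)
```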
27 changes: 15 additions & 12 deletions twinTrim/flagController.py
@@ -7,20 +7,28 @@
from twinTrim.dataStructures.fileMetadata import FileMetadata, normalStore, add_or_update_normal_file
from tqdm import tqdm

-def handleAllFlag(directory,file_filter):
+# Bar function to be used by the scanner thread pools
+progress_bar_format = "{desc}: {n_fmt}/{total_fmt} | ETA={remaining} | {rate_fmt} {bar} {percentage:.3f}%"
+
+def progress_bar_func(bar_desc, total, unit='file', color="yellow", bar_color='white'):
+    """Adaptive progress bar; returns a tqdm object."""
+    try:  # default to yellow
+        bar_desc_obj = click.style(bar_desc, fg=color, bold=True)  # set to bold by default
+    except Exception:
+        print(f"Warning: invalid color '{color}', falling back to default yellow.")
+        bar_desc_obj = click.style(bar_desc, fg='yellow', bold=True)
+    return tqdm(total=total, desc=bar_desc_obj, unit=unit, bar_format=progress_bar_format, colour=bar_color)
+
+def handleAllFlag(directory, file_filter, pb_color, bar_color):
    """Handle all duplicates automatically without asking if --all flag is set."""
    all_start_time = time.time()
-    yellow = '\033[93m'
-    reset = '\033[0m'
-    progress_bar_format = f"{yellow}{{l_bar}}{{bar}}{{r_bar}}{{bar}}{reset}"

    # Collect all file paths to process
    all_files = [os.path.join(root, file_name) for root, _, files in os.walk(directory) for file_name in files]
    all_files = [f for f in all_files if file_filter.filter_files(f)]
    total_files = len(all_files)

    # Use ThreadPoolExecutor to handle files concurrently
-    with ThreadPoolExecutor() as executor, tqdm(total=total_files, desc="Scanning files", unit="file", bar_format=progress_bar_format) as progress_bar:
+    with ThreadPoolExecutor() as executor, progress_bar_func("Scanning", total_files, color=pb_color, bar_color=bar_color) as progress_bar:
        futures = {executor.submit(add_or_update_file, file_path): file_path for file_path in all_files}

        # Update progress bar as files are processed
@@ -35,24 +43,19 @@ def handleAllFlag(directory,file_filter):
    click.echo(click.style("All duplicates deleted!", fg='green'))


-def find_duplicates(directory, file_filter):
+def find_duplicates(directory, file_filter, pb_color, bar_color):
    """Find duplicate files in the given directory and store them in normalStore."""
    # Collect all file paths first and apply filters
    all_files = [os.path.join(root, file_name) for root, _, files in os.walk(directory) for file_name in files]
    all_files = [f for f in all_files if file_filter.filter_files(f)]  # Apply filters

    # Calculate the total number of files and ensure it is finite
    total_files = len(all_files)

-    # Define yellow color ANSI escape code
-    yellow = '\033[93m'
-    reset = '\033[0m'
-    progress_bar_format = f"{yellow}{{l_bar}}{{bar}}{{r_bar}}{{bar}}{reset}"

    def process_file(file_path):
        add_or_update_normal_file(file_path)

-    with ThreadPoolExecutor() as executor, tqdm(total=total_files, desc="Scanning files", unit="file", bar_format=progress_bar_format) as progress_bar:
+    with ThreadPoolExecutor() as executor, progress_bar_func("Hashing", total_files, color=pb_color, bar_color=bar_color) as progress_bar:
        # Submit tasks to the executor
        futures = {executor.submit(process_file, file_path): file_path for file_path in all_files}

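Both functions end with a progress loop that this diff elides; assuming `concurrent.futures`, the pattern is plausibly along these lines (a sketch, not the repository's exact code):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def scan_with_progress(all_files, process_file, progress_bar):
    """Process every file on a thread pool, ticking the bar as futures complete."""
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_file, path): path for path in all_files}
        for future in as_completed(futures):
            future.result()          # re-raise any worker exception
            progress_bar.update(1)   # one tick per completed file
```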
10 changes: 6 additions & 4 deletions twinTrim/flags.py
@@ -10,11 +10,13 @@
@click.command()
@click.argument("directory", type=click.Path(exists=True))
@click.option("--all", is_flag=True, help="Delete duplicates automatically without asking.")
@click.option("--min-size", default="10kb", type=str, help="Minimum file size in bytes.")
@click.option("--min-size", default="0kb", type=str, help="Minimum file size in bytes.")
@click.option("--max-size", default="1gb", type=str, help="Maximum file size in bytes.")
@click.option("--file-type", default=".*", help="File type to include (e.g., .txt, .jpg).")
@click.option("--exclude", multiple=True, help="Files to exclude by name.")
def cli(directory, all, min_size, max_size, file_type, exclude):
@click.option("--label-color", default="yellow", type=str, help="Color of the label of progress bar.")
@click.option("--bar-color", default='#aaaaaa', type=str, help="Color of the progress bar.")
def cli(directory, all, min_size, max_size, file_type, exclude, label_color, bar_color):
"""Find and manage duplicate files in the specified DIRECTORY."""

    # Initialize the FileFilter object
@@ -26,11 +28,11 @@ def cli(directory, all, min_size, max_size, file_type, exclude):
        file_filter.addFileExclude(file_name)

    if all:
-        handleAllFlag(directory, file_filter)
+        handleAllFlag(directory, file_filter, label_color, bar_color)
        return

    start_time = time.time()
-    duplicates = find_duplicates(directory, file_filter)
+    duplicates = find_duplicates(directory, file_filter, label_color, bar_color)

    end_time = time.time()
    time_taken = end_time - start_time
