diff --git a/.setup_dev.sh b/.setup_dev.sh
index 7cd300e9..24a3aaaf 100644
--- a/.setup_dev.sh
+++ b/.setup_dev.sh
@@ -34,7 +34,7 @@ python -m pip install -e . > /dev/null
 echo "Installing developer dependencies in local environment"
 echo "This might take a few minutes. Only errors will be printed to stdout"
 python -m pip install -e .'[dev]' > /dev/null
-if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt; fi
+if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi
 
 echo "Installing pre-commit"
 pre-commit install > /dev/null
diff --git a/src/fibad/data_sets/hsc_data_set.py b/src/fibad/data_sets/hsc_data_set.py
index a9854f09..2780225f 100644
--- a/src/fibad/data_sets/hsc_data_set.py
+++ b/src/fibad/data_sets/hsc_data_set.py
@@ -413,8 +413,8 @@ def _prune_objects(self, filters_ref: list[str], cutout_shape: Optional[tuple[in
                 logger.info(f"Filters for object {object_id} were {filters_unsorted}")
                 logger.debug(f"Reference filters were {filters_ref}")
 
-            # Drop objects that can't meet the coutout size provided
             elif cutout_shape is not None:
+                # Drop objects that can't meet the cutout size provided
                 for shape in self.dims[object_id]:
                     if shape[0] < cutout_shape[0] or shape[1] < cutout_shape[1]:
                         msg = f"A file for object {object_id} has shape ({shape[1]}px, {shape[1]}px)"
@@ -422,6 +422,29 @@ def _prune_objects(self, filters_ref: list[str], cutout_shape: Optional[tuple[in
                         msg += f"({cutout_shape[0]}px, {cutout_shape[1]}px)"
                         self._mark_for_prune(object_id, msg)
                         break
+
+                # Drop objects where the cutouts are not the same size
+                first_shape = None
+                for shape in self.dims[object_id]:
+                    first_shape = shape if first_shape is None else first_shape
+                    if shape != first_shape:
+                        msg = f"The first filter for object {object_id} has a shape of "
+                        msg += f"({first_shape[0]}px, {first_shape[1]}px) but another filter has a shape of "
+                        msg += f"({shape[0]}px, {shape[1]}px)"
+                        self._mark_for_prune(object_id, msg)
+                        break
+
+                # Drop objects where parsing the filenames does not reveal the object IDs
+                for filter, filepath in filters_unsorted.items():
+                    filename = Path(filepath).name
+                    # Check the beginning of the filename against the object_id
+                    if filename[: len(object_id)] != object_id:
+                        msg = f"Filter {filter} for object id {object_id} has filename {filepath} listed. "
+                        msg += "The filename does not match the object_id, and the filter_catalog or "
+                        msg += "manifest is likely corrupt."
+                        self._mark_for_prune(object_id, msg)
+                        break
+
             if index != 0 and index % 1_000_000 == 0:
                 logger.info(f"Processed {index} objects for pruning")
         else:
diff --git a/tests/fibad/test_hsc_dataset.py b/tests/fibad/test_hsc_dataset.py
index 52c78f4e..8e4234f0 100644
--- a/tests/fibad/test_hsc_dataset.py
+++ b/tests/fibad/test_hsc_dataset.py
@@ -1,6 +1,7 @@
 import logging
 import unittest.mock as mock
 from pathlib import Path
+from typing import Any, Optional
 
 import numpy as np
 import pytest
@@ -28,8 +29,8 @@ class FakeFitsFS:
     more filesystem operations without a really good reason.
""" - def __init__(self, test_files: dict): - self.patchers: list[mock._patch[mock.Mock]] = [] + def __init__(self, test_files: dict, filter_catalog: Optional[dict] = None): + self.patchers: list[mock._patch[Any]] = [] self.test_files = test_files @@ -40,6 +41,17 @@ def __init__(self, test_files: dict): mock_fits_open = mock.Mock(side_effect=self._open_file) self.patchers.append(mock.patch("astropy.io.fits.open", mock_fits_open)) + if filter_catalog is not None: + mock_read_filter_catalog = mock.patch( + "fibad.data_sets.hsc_data_set.HSCDataSet._read_filter_catalog", + lambda x, y: "Not a real table", + ) + self.patchers.append(mock_read_filter_catalog) + mock_parse_filter_catalog = mock.patch( + "fibad.data_sets.hsc_data_set.HSCDataSet._parse_filter_catalog", lambda x, y: filter_catalog + ) + self.patchers.append(mock_parse_filter_catalog) + def _open_file(self, filename: Path, **kwargs) -> mock.Mock: shape = self.test_files[filename.name] mock_open_ctx = mock.Mock() @@ -118,6 +130,42 @@ def generate_files( return test_files +def generate_filter_catalog(test_files: dict, config: dict) -> dict: + """Generates a filter catalog dict for use with FakeFitsFS from a filesystem dictionary + created by generate_files. + + This allows tests to alter the parsed filter_catalog, and interrogate what decisions HSCDataSet makes + when a manifest or filter_catalog file contains corrupt information. + + Caveats: + Always Run this prior to creating any mocks, or using a FakeFitsFS + + This function calls the HSCDataSet parsing code to actually assemble the files array, which means: + 1) This test setup is fairly tightly coupled to the internals of HSCDataSet + 2) When writing tests using this, ensure any functionality you depend on from the slow file-scan + initialization codepath is tested indepently. + + Parameters + ---------- + test_files : dict + Test files dictionary created with generate_files. + config : dict + The config you will use to initialize the test HSCDataSet object in your test. 
+        Create this with mkconfig().
+
+    Returns
+    -------
+    dict
+        Dictionary from ObjectID -> (Dictionary from Filter -> Filename)
+    """
+    # Ensure the filter_catalog codepath won't be triggered
+    config["data_set"]["filter_catalog"] = False
+
+    # Use our initialization code to create a parsed files object
+    with FakeFitsFS(test_files):
+        return HSCDataSet(config).files
+
+
 def test_load(caplog):
     """Test to ensure loading a perfectly regular set of files works"""
     caplog.set_level(logging.WARNING)
@@ -314,6 +362,52 @@ def test_prune_size(caplog):
     assert "too small" in caplog.text
 
 
+def test_prune_filter_size_mismatch(caplog):
+    """Test to ensure objects whose images have different sizes across filters will be dropped"""
+    caplog.set_level(logging.WARNING)
+    test_files = {}
+    test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
+    test_files["00000000000000000_all_filters_HSC-R.fits"] = (99, 99)
+
+    with FakeFitsFS(test_files):
+        a = HSCDataSet(mkconfig(crop_to=(99, 99)))
+
+    assert len(a) == 9
+    assert a.shape() == (5, 99, 99)
+
+    # We should warn that we are dropping objects and the reason
+    assert "Dropping object" in caplog.text
+    assert "first filter" in caplog.text
+
+
+def test_prune_bad_filename(caplog):
+    """Test to ensure objects whose filenames do not match their object_id will be dropped"""
+    caplog.set_level(logging.WARNING)
+    test_files = {}
+    test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
+
+    config = mkconfig(crop_to=(99, 99), filter_catalog="notarealfile.fits")
+
+    # Create a filter catalog with incorrect file information.
+    filter_catalog = generate_filter_catalog(test_files, config)
+    filters = list(filter_catalog["00000000000000000"].keys())
+    filter_catalog["00000000000000000"][filters[0]] = filter_catalog["00000000000000001"][filters[0]]
+
+    with FakeFitsFS(test_files, filter_catalog):
+        # Initialize HSCDataSet, exercising the filter_catalog-based initialization pathway
+        a = HSCDataSet(config)
+
+    # Verify that the broken object has been dropped
+    assert len(a) == 9
+
+    # Verify the shape is correct.
+    assert a.shape() == (5, 99, 99)
+
+    # We should warn that we are dropping objects and the correct reason
+    assert "Dropping object" in caplog.text
+    assert "manifest is likely corrupt" in caplog.text
+
+
 def test_partial_filter(caplog):
     """Test to ensure when we only load some of the filters, only those filters end up in the dataset"""
     caplog.set_level(logging.WARNING)
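For reference, a minimal standalone sketch of the data structure these changes pass around and the filename check they add. The object IDs and the "<object_id>_all_filters_<filter>.fits" naming pattern mirror the fixtures produced by generate_files above; HSC-G is only an assumed example filter name, and has_bad_filename is an illustrative restatement of the pruning condition, not a function in fibad.

from pathlib import Path

# Shape of the parsed filter_catalog: object_id -> {filter -> filename}
filter_catalog = {
    "00000000000000000": {
        "HSC-G": "00000000000000000_all_filters_HSC-G.fits",
        "HSC-R": "00000000000000000_all_filters_HSC-R.fits",
    },
    "00000000000000001": {
        "HSC-G": "00000000000000001_all_filters_HSC-G.fits",
        "HSC-R": "00000000000000001_all_filters_HSC-R.fits",
    },
}

# Corrupt one entry the same way test_prune_bad_filename does: point a filter
# of object ...000 at a file that belongs to object ...001.
filter_catalog["00000000000000000"]["HSC-G"] = filter_catalog["00000000000000001"]["HSC-G"]


def has_bad_filename(object_id: str, filters: dict) -> bool:
    """Illustrative restatement of the new check: an object is corrupt if any
    listed filename does not begin with that object's ID."""
    return any(Path(filepath).name[: len(object_id)] != object_id for filepath in filters.values())


assert has_bad_filename("00000000000000000", filter_catalog["00000000000000000"])
assert not has_bad_filename("00000000000000001", filter_catalog["00000000000000001"])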