Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .setup_dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ python -m pip install -e . > /dev/null
echo "Installing developer dependencies in local environment"
echo "This might take a few minutes. Only errors will be printed to stdout"
python -m pip install -e .'[dev]' > /dev/null
if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt; fi
if [ -f docs/requirements.txt ]; then python -m pip install -r docs/requirements.txt > /dev/null; fi
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you haven't already, it would be worthwhile percolating this up to the Python Project Template.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


echo "Installing pre-commit"
pre-commit install > /dev/null
Expand Down
25 changes: 24 additions & 1 deletion src/fibad/data_sets/hsc_data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,15 +413,38 @@ def _prune_objects(self, filters_ref: list[str], cutout_shape: Optional[tuple[in
logger.info(f"Filters for object {object_id} were {filters_unsorted}")
logger.debug(f"Reference filters were {filters_ref}")

# Drop objects that can't meet the coutout size provided
elif cutout_shape is not None:
# Drop objects that can't meet the cutout size provided
for shape in self.dims[object_id]:
if shape[0] < cutout_shape[0] or shape[1] < cutout_shape[1]:
msg = f"A file for object {object_id} has shape ({shape[1]}px, {shape[1]}px)"
msg += " this is too small for the given cutout size of "
msg += f"({cutout_shape[0]}px, {cutout_shape[1]}px)"
self._mark_for_prune(object_id, msg)
break

# Drop objects where the cutouts are not the same size
first_shape = None
for shape in self.dims[object_id]:
first_shape = shape if first_shape is None else first_shape
if shape != first_shape:
msg = f"The first filter for object {object_id} has a shape of "
msg += f"({first_shape[0]}px,{first_shape[1]}px) another filter has shape of"
msg += f"({shape[0]}px,{shape[1]}px)"
self._mark_for_prune(object_id, msg)
break

# Drop objects where parsing the filenames does not reveal the object IDs
for filter, filepath in filters_unsorted.items():
filename = Path(filepath).name
# Check beginning of filename vs object_id
if filename[:17] != object_id:
msg = f"Filter {filter} for object id {object_id} has filename {filepath} listed"
msg += "The filename does not match the object_id, and the filter_catalog or "
msg += "manifest is likely corrupt."
self._mark_for_prune(object_id, msg)
break

if index != 0 and index % 1_000_000 == 0:
logger.info(f"Processed {index} objects for pruning")
else:
Expand Down
98 changes: 96 additions & 2 deletions tests/fibad/test_hsc_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import unittest.mock as mock
from pathlib import Path
from typing import Any, Optional

import numpy as np
import pytest
Expand Down Expand Up @@ -28,8 +29,8 @@ class FakeFitsFS:
more filesystem operations without a really good reason.
"""

def __init__(self, test_files: dict):
self.patchers: list[mock._patch[mock.Mock]] = []
def __init__(self, test_files: dict, filter_catalog: Optional[dict] = None):
self.patchers: list[mock._patch[Any]] = []

self.test_files = test_files

Expand All @@ -40,6 +41,17 @@ def __init__(self, test_files: dict):
mock_fits_open = mock.Mock(side_effect=self._open_file)
self.patchers.append(mock.patch("astropy.io.fits.open", mock_fits_open))

if filter_catalog is not None:
mock_read_filter_catalog = mock.patch(
"fibad.data_sets.hsc_data_set.HSCDataSet._read_filter_catalog",
lambda x, y: "Not a real table",
)
self.patchers.append(mock_read_filter_catalog)
mock_parse_filter_catalog = mock.patch(
"fibad.data_sets.hsc_data_set.HSCDataSet._parse_filter_catalog", lambda x, y: filter_catalog
)
self.patchers.append(mock_parse_filter_catalog)

def _open_file(self, filename: Path, **kwargs) -> mock.Mock:
shape = self.test_files[filename.name]
mock_open_ctx = mock.Mock()
Expand Down Expand Up @@ -118,6 +130,42 @@ def generate_files(
return test_files


def generate_filter_catalog(test_files: dict, config: dict) -> dict:
    """Build a filter-catalog dict for FakeFitsFS out of a generate_files() filesystem dict.

    Lets a test tamper with the parsed filter_catalog and then observe what decisions
    HSCDataSet makes when a manifest or filter_catalog file carries corrupt information.

    Caveats:
        Always run this before creating any mocks or entering a FakeFitsFS context.

        The files mapping is assembled by HSCDataSet's own parsing code, so:
        1) This helper is tightly coupled to HSCDataSet internals.
        2) Any behavior you rely on from the slow file-scan initialization path should
           also be covered by an independent test.

    Parameters
    ----------
    test_files : dict
        Test-files dictionary produced by generate_files.
    config : dict
        The config the test will later hand to HSCDataSet; build it with mkconfig().
        NOTE(review): mutated in place — data_set.filter_catalog is forced to False
        so the filter_catalog codepath is not triggered here.

    Returns
    -------
    dict
        Mapping of ObjectID -> (mapping of Filter -> Filename).
    """
    # Disable the filter_catalog codepath so initialization performs the file scan.
    config["data_set"]["filter_catalog"] = False

    # Let the production initialization code parse the fake filesystem for us.
    with FakeFitsFS(test_files):
        dataset = HSCDataSet(config)

    return dataset.files


def test_load(caplog):
"""Test to ensure loading a perfectly regular set of files works"""
caplog.set_level(logging.WARNING)
Expand Down Expand Up @@ -314,6 +362,52 @@ def test_prune_size(caplog):
assert "too small" in caplog.text


def test_prune_filter_size_mismatch(caplog):
    """Objects whose filters disagree on image size must be dropped from the data set."""
    caplog.set_level(logging.WARNING)

    # Ten well-formed objects, then shrink a single filter of object 0 to (99, 99).
    test_files = {}
    test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))
    test_files["00000000000000000_all_filters_HSC-R.fits"] = (99, 99)

    with FakeFitsFS(test_files):
        dataset = HSCDataSet(mkconfig(crop_to=(99, 99)))

    # The mismatched object is pruned; the nine survivors keep the cropped shape.
    assert len(dataset) == 9
    assert dataset.shape() == (5, 99, 99)

    # The prune must be logged along with its reason.
    assert "Dropping object" in caplog.text
    assert "first filter" in caplog.text


def test_prune_bad_filename(caplog):
    """Objects whose catalog filename does not match their object id must be dropped."""
    caplog.set_level(logging.WARNING)

    test_files = {}
    test_files.update(generate_files(num_objects=10, num_filters=5, shape=(100, 100), offset=0))

    config = mkconfig(crop_to=(99, 99), filter_catalog="notarealfile.fits")

    # Build a parsed filter catalog, then corrupt object 0 by pointing one of its
    # filters at object 1's file.
    filter_catalog = generate_filter_catalog(test_files, config)
    first_filter = list(filter_catalog["00000000000000000"].keys())[0]
    filter_catalog["00000000000000000"][first_filter] = filter_catalog["00000000000000001"][first_filter]

    with FakeFitsFS(test_files, filter_catalog):
        # Exercise the filter_catalog-provided initialization pathway.
        dataset = HSCDataSet(config)

    # The corrupt object is dropped.
    assert len(dataset) == 9

    # The remaining objects keep the cropped shape.
    assert dataset.shape() == (5, 99, 99)

    # The prune must be logged along with the correct reason.
    assert "Dropping object" in caplog.text
    assert "manifest is likely corrupt" in caplog.text


def test_partial_filter(caplog):
"""Test to ensure when we only load some of the filters, only those filters end up in the dataset"""
caplog.set_level(logging.WARNING)
Expand Down