Skip to content

Commit

Permalink
Added progressive log entries for HSCDataSet file scan
Browse files (browse the repository at this point in the history)
Intending to run this on Hyak to tune parameters.
  • Loading branch information
mtauraso committed Nov 15, 2024
1 parent 4e2456a commit 3aa0e85
Showing 1 changed file with 12 additions and 5 deletions.
17 changes: 12 additions & 5 deletions src/fibad/data_sets/hsc_data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,9 +470,10 @@ def _scan_file_dimensions(self) -> dim_dict:
logger.info("Scanning for dimensions...")

retval = {}

with MultiPool() as pool:
args = ((object_id, list(self._object_files(object_id))) for object_id in self.ids())
with MultiPool(processes=10) as pool:
args = (
(object_id, list(self._object_files(object_id))) for object_id in self.ids(log_every=100_000)
)
retval = dict(pool.map(self._scan_file_dimension, args))
return retval

Expand Down Expand Up @@ -761,7 +762,7 @@ def _get_file(self, index: int) -> Path:
filter = filter_names[index % self.num_filters]
return self._file_to_path(filters[filter])

def ids(self):
def ids(self, log_every=None):
"""Public read-only iterator over all object_ids that enforces a strict total order across
objects. Will not work prior to self.files initialization in __init__
Expand All @@ -770,8 +771,14 @@ def ids(self):
Iterator[str]
Object IDs currently in the dataset
"""
for object_id in self.files:
log = log_every is not None and isinstance(log_every, int)
for index, object_id in enumerate(self.files):
if log and index != 0 and index % log_every == 0:
logger.info(f"Processed {index} objects")

Check warning on line 777 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L777

Added line #L777 was not covered by tests
yield object_id
else:
if log:
logger.info(f"Processed {index} objects")

def _all_files_full(self):
"""
Expand Down

0 comments on commit 3aa0e85

Please sign in to comment.