diff --git a/docs/source/extensions/numbadoc.py b/docs/source/extensions/numbadoc.py index 5b4202a1..06de3418 100644 --- a/docs/source/extensions/numbadoc.py +++ b/docs/source/extensions/numbadoc.py @@ -27,7 +27,7 @@ def import_object(self) -> bool: """ success = super().import_object() if success: - # Store away numba wrapper + # store away numba wrapper self.jitobj = self.object # And bend references to underlying python function if hasattr(self.object, "py_func"): diff --git a/docs/source/notebooks/DataCompression.ipynb b/docs/source/notebooks/DataCompression.ipynb index fad9c9bc..74a26c92 100644 --- a/docs/source/notebooks/DataCompression.ipynb +++ b/docs/source/notebooks/DataCompression.ipynb @@ -61,8 +61,8 @@ "metadata": {}, "outputs": [], "source": [ - "store = lgdo.LH5Store()\n", - "store.write_object(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n", + "store = lgdo.lh5.LH5Store()\n", + "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n", "lgdo.show(\"data.lh5\")" ] }, @@ -110,7 +110,7 @@ "metadata": {}, "outputs": [], "source": [ - "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS" + "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS" ] }, { @@ -131,18 +131,18 @@ "outputs": [], "source": [ "# use another built-in filter\n", - "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS = {\"compression\": \"lzf\"}\n", + "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {\"compression\": \"lzf\"}\n", "\n", "# specify filter name and options\n", - "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS = {\"compression\": \"gzip\", \"compression_opts\": 7}\n", + "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {\"compression\": \"gzip\", \"compression_opts\": 7}\n", "\n", "# specify a registered filter provided by hdf5plugin\n", "import hdf5plugin\n", "\n", - "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS = {\"compression\": hdf5plugin.Blosc()}\n", + "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {\"compression\": hdf5plugin.Blosc()}\n", "\n", "# shuffle bytes before compressing (typically better compression ratio with no performance penalty)\n", - "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS = {\"shuffle\": True, \"compression\": \"lzf\"}" + "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {\"shuffle\": True, \"compression\": \"lzf\"}" ] }, { @@ -166,7 +166,7 @@ "metadata": {}, "outputs": [], "source": [ - "store.write_object(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n", + "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n", "show_h5ds_opts(\"data/col1\")" ] }, @@ -175,7 +175,7 @@ "id": "f597a9e2", "metadata": {}, "source": [ - "Nice. Shuffling bytes before compressing significantly reduced size on disk. Last but not least, `create_dataset()` keyword arguments can be passed to `write_object()`. They will be forwarded as is, overriding default settings." + "Nice. Shuffling bytes before compressing significantly reduced size on disk. Last but not least, `create_dataset()` keyword arguments can be passed to `write()`. They will be forwarded as is, overriding default settings." 
] }, { @@ -185,9 +185,7 @@ "metadata": {}, "outputs": [], "source": [ - "store.write_object(\n", - " data, \"data\", \"data.lh5\", wo_mode=\"of\", shuffle=True, compression=\"gzip\"\n", - ")\n", + "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\", shuffle=True, compression=\"gzip\")\n", "show_h5ds_opts(\"data/col1\")" ] }, @@ -207,7 +205,7 @@ "outputs": [], "source": [ "data[\"col2\"].attrs[\"hdf5_settings\"] = {\"compression\": \"gzip\"}\n", - "store.write_object(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n", + "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n", "\n", "show_h5ds_opts(\"data/col1\")\n", "show_h5ds_opts(\"data/col2\")" @@ -221,7 +219,7 @@ "We are now storing table columns with different compression settings.\n", "\n", "
\n", - "**Note:** since any [h5py.Group.create_dataset()](https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset) keyword argument can be used in `write_object()` or set in the `hdf5_settings` attribute, other HDF5 dataset settings can be configured, like the chunk size.\n", + "**Note:** since any [h5py.Group.create_dataset()](https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset) keyword argument can be used in `write()` or set in the `hdf5_settings` attribute, other HDF5 dataset settings can be configured, like the chunk size.\n", "
" ] }, @@ -232,7 +230,7 @@ "metadata": {}, "outputs": [], "source": [ - "store.write_object(data, \"data\", \"data.lh5\", wo_mode=\"of\", chunks=2)" + "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\", chunks=2)" ] }, { @@ -257,7 +255,7 @@ "from legendtestdata import LegendTestData\n", "\n", "ldata = LegendTestData()\n", - "wfs, n_rows = store.read_object(\n", + "wfs, n_rows = store.read(\n", " \"geds/raw/waveform\",\n", " ldata.get_path(\"lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5\"),\n", ")\n", @@ -347,7 +345,7 @@ " t0=wfs.t0,\n", " dt=wfs.dt,\n", ")\n", - "store.write_object(enc_wfs, \"waveforms\", \"data.lh5\", wo_mode=\"o\")\n", + "store.write(enc_wfs, \"waveforms\", \"data.lh5\", wo_mode=\"o\")\n", "lgdo.show(\"data.lh5\", attrs=True)" ] }, @@ -372,7 +370,7 @@ "metadata": {}, "outputs": [], "source": [ - "obj, _ = store.read_object(\"waveforms\", \"data.lh5\")\n", + "obj, _ = store.read(\"waveforms\", \"data.lh5\")\n", "obj.values" ] }, @@ -391,7 +389,7 @@ "metadata": {}, "outputs": [], "source": [ - "obj, _ = store.read_object(\"waveforms\", \"data.lh5\", decompress=False)\n", + "obj, _ = store.read(\"waveforms\", \"data.lh5\", decompress=False)\n", "obj.values" ] }, @@ -433,9 +431,9 @@ "from lgdo.compression import ULEB128ZigZagDiff\n", "\n", "wfs.values.attrs[\"compression\"] = ULEB128ZigZagDiff()\n", - "store.write_object(wfs, \"waveforms\", \"data.lh5\", wo_mode=\"of\")\n", + "store.write(wfs, \"waveforms\", \"data.lh5\", wo_mode=\"of\")\n", "\n", - "obj, _ = store.read_object(\"waveforms\", \"data.lh5\", decompress=False)\n", + "obj, _ = store.read(\"waveforms\", \"data.lh5\", decompress=False)\n", "obj.values.attrs[\"codec\"]" ] }, @@ -447,8 +445,8 @@ "Further reading:\n", "\n", "- [Available waveform compression algorithms](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.compression.html)\n", - "- [read_object() docstring](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.LH5Store.read_object)\n", - "- [write_object() docstring](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.LH5Store.write_object)" + "- [read() docstring](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5.store.LH5Store.read)\n", + "- [write() docstring](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.LH5Store.write)" ] } ], diff --git a/docs/source/notebooks/LH5Files.ipynb b/docs/source/notebooks/LH5Files.ipynb index 8563f4bd..9c594be9 100644 --- a/docs/source/notebooks/LH5Files.ipynb +++ b/docs/source/notebooks/LH5Files.ipynb @@ -38,7 +38,7 @@ "id": "c136b537", "metadata": {}, "source": [ - "We can use `lgdo.lh5_store.ls()` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.ls) to inspect the file contents:" + "We can use `lgdo.lh5.ls()` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5.ls) to inspect the file contents:" ] }, { @@ -131,7 +131,7 @@ "metadata": {}, "outputs": [], "source": [ - "store.read_object(\"geds/raw\", lh5_file)" + "store.read(\"geds/raw\", lh5_file)" ] }, { @@ -149,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "obj, n_rows = store.read_object(\"geds/raw/timestamp\", lh5_file)\n", + "obj, n_rows = store.read(\"geds/raw/timestamp\", lh5_file)\n", "obj" ] }, @@ -170,7 +170,7 @@ "metadata": {}, "outputs": [], "source": [ - "obj, n_rows = store.read_object(\"geds/raw/timestamp\", lh5_file, start_row=15, n_rows=10)\n", + "obj, n_rows = store.read(\"geds/raw/timestamp\", 
lh5_file, start_row=15, n_rows=10)\n", "print(obj)" ] }, @@ -189,7 +189,7 @@ "metadata": {}, "outputs": [], "source": [ - "obj, n_rows = store.read_object(\n", + "obj, n_rows = store.read(\n", " \"geds/raw\", lh5_file, field_mask=(\"timestamp\", \"energy\"), idx=[1, 3, 7, 9, 10, 15]\n", ")\n", "print(obj)" @@ -200,7 +200,7 @@ "id": "b3f52d77", "metadata": {}, "source": [ - "As you might have noticed, `read_object()` loads all the requested data in memory at once. This can be a problem when dealing with large datasets. `LH5Iterator` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.LH5Iterator) makes it possible to handle data one chunk at a time (sequentially) to avoid running out of memory:" + "As you might have noticed, `read_object()` loads all the requested data in memory at once. This can be a problem when dealing with large datasets. `LH5Iterator` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5.iterator.LH5Iterator) makes it possible to handle data one chunk at a time (sequentially) to avoid running out of memory:" ] }, { @@ -260,9 +260,7 @@ "source": [ "store = LH5Store()\n", "\n", - "store.write_object(\n", - " scalar, name=\"message\", lh5_file=\"my_objects.lh5\", wo_mode=\"overwrite_file\"\n", - ")" + "store.write(scalar, name=\"message\", lh5_file=\"my_objects.lh5\", wo_mode=\"overwrite_file\")" ] }, { @@ -300,10 +298,8 @@ "metadata": {}, "outputs": [], "source": [ - "store.write_object(array, name=\"numbers\", group=\"closet\", lh5_file=\"my_objects.lh5\")\n", - "store.write_object(\n", - " wf_table, name=\"waveforms\", group=\"closet\", lh5_file=\"my_objects.lh5\"\n", - ")\n", + "store.write(array, name=\"numbers\", group=\"closet\", lh5_file=\"my_objects.lh5\")\n", + "store.write(wf_table, name=\"waveforms\", group=\"closet\", lh5_file=\"my_objects.lh5\")\n", "show(\"my_objects.lh5\")" ] }, diff --git a/src/lgdo/__init__.py b/src/lgdo/__init__.py index 5e6eb7e0..25efb08d 100644 --- a/src/lgdo/__init__.py +++ b/src/lgdo/__init__.py @@ -66,11 +66,11 @@ "VectorOfVectors", "VectorOfEncodedVectors", "WaveformTable", - "LH5Iterator", - "LH5Store", "load_dfs", "load_nda", "ls", "show", + "LH5Iterator", + "LH5Store", "__version__", ] diff --git a/src/lgdo/cli.py b/src/lgdo/cli.py index 24ba56d1..2273579a 100644 --- a/src/lgdo/cli.py +++ b/src/lgdo/cli.py @@ -9,7 +9,7 @@ def lh5ls(): - """:func:`.show` command line interface.""" + """:func:`.lh5.show` command line interface.""" parser = argparse.ArgumentParser( prog="lh5ls", description="Inspect LEGEND HDF5 (LH5) file contents" ) diff --git a/src/lgdo/lgdo_utils.py b/src/lgdo/lgdo_utils.py index 05b46bd5..cddd2111 100644 --- a/src/lgdo/lgdo_utils.py +++ b/src/lgdo/lgdo_utils.py @@ -1,149 +1,56 @@ -"""Implements utilities for LEGEND Data Objects.""" from __future__ import annotations -import glob -import logging -import os -import string +from warnings import warn import numpy as np from . import types as lgdo +from .lh5 import utils -log = logging.getLogger(__name__) - -def get_element_type(obj: object) -> str: - """Get the LGDO element type of a scalar or array. - - For use in LGDO datatype attributes. - - Parameters - ---------- - obj - if a ``str``, will automatically return ``string`` if the object has - a :class:`numpy.dtype`, that will be used for determining the element - type otherwise will attempt to case the type of the object to a - :class:`numpy.dtype`. - - Returns - ------- - element_type - A string stating the determined element type of the object. 
- """ - - # special handling for strings - if isinstance(obj, str): - return "string" - - # the rest use dtypes - dt = obj.dtype if hasattr(obj, "dtype") else np.dtype(type(obj)) - kind = dt.kind - - if kind == "b": - return "bool" - if kind == "V": - return "blob" - if kind in ["i", "u", "f"]: - return "real" - if kind == "c": - return "complex" - if kind in ["S", "U"]: - return "string" - - # couldn't figure it out - raise ValueError( - "cannot determine lgdo element_type for object of type", type(obj).__name__ +def copy(obj: lgdo.LGDO, dtype: np.dtype = None) -> None: + warn( + "lgdo_utils.copy will soon be removed and will be replaced soon with copy member functions of each LGDO data type.", + DeprecationWarning, + stacklevel=2, ) + return utils.copy(obj, dtype) -def copy(obj: lgdo.LGDO, dtype: np.dtype = None) -> lgdo.LGDO: - """Return a copy of an LGDO. - - Parameters - ---------- - obj - the LGDO to be copied. - dtype - NumPy dtype to be used for the copied object. - - """ - if dtype is None: - dtype = obj.dtype - - if isinstance(obj, lgdo.Array): - return lgdo.Array( - np.array(obj.nda, dtype=dtype, copy=True), attrs=dict(obj.attrs) - ) - - if isinstance(obj, lgdo.VectorOfVectors): - return lgdo.VectorOfVectors( - flattened_data=copy(obj.flattened_data, dtype=dtype), - cumulative_length=copy(obj.cumulative_length), - attrs=dict(obj.attrs), - ) - - else: - raise ValueError(f"copy of {type(obj)} not supported") +def get_element_type(obj: object) -> str: + warn( + "'lgdo.lgdo_utils' has been renamed to 'lgdo.utils'. " + "Please replace either 'import lgdo.lgdo_utils as utils' with 'import lgdo.utils as utils' " + "or 'from lgdo.lgdo_utils import get_element_type' with 'from lgdo.utils import get_element_type'." + "'lgdo.lgdo_utils' will be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + return utils.get_element_type(obj) def parse_datatype(datatype: str) -> tuple[str, tuple[int, ...], str | list[str]]: - """Parse datatype string and return type, dimensions and elements. - - Parameters - ---------- - datatype - a LGDO-formatted datatype string. - - Returns - ------- - element_type - the datatype name dims if not ``None``, a tuple of dimensions for the - LGDO. Note this is not the same as the NumPy shape of the underlying - data object. See the LGDO specification for more information. Also see - :class:`~.types.ArrayOfEqualSizedArrays` and - :meth:`.lh5_store.LH5Store.read_object` for example code elements for - numeric objects, the element type for struct-like objects, the list of - fields in the struct. - """ - if "{" not in datatype: - return "scalar", None, datatype - - # for other datatypes, need to parse the datatype string - from parse import parse - - datatype, element_description = parse("{}{{{}}}", datatype) - if datatype.endswith(">"): - datatype, dims = parse("{}<{}>", datatype) - dims = [int(i) for i in dims.split(",")] - return datatype, tuple(dims), element_description - else: - return datatype, None, element_description.split(",") + warn( + "'lgdo.lgdo_utils' has been renamed to 'lgdo.utils'. " + "Please replace either 'import lgdo.lgdo_utils as utils' with 'import lgdo.utils as utils' " + "or 'from lgdo.lgdo_utils import parse_datatype' with 'from lgdo.utils import parse_datatype'." + "'lgdo.lgdo_utils' will be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + return utils.parse_datatype(datatype) def expand_vars(expr: str, substitute: dict[str, str] = None) -> str: - """Expand (environment) variables. 
- - Note - ---- - Malformed variable names and references to non-existing variables are left - unchanged. - - Parameters - ---------- - expr - string expression, which may include (environment) variables prefixed by - ``$``. - substitute - use this dictionary to substitute variables. Takes precedence over - environment variables. - """ - if substitute is None: - substitute = {} - - # use provided mapping - # then expand env variables - return os.path.expandvars(string.Template(expr).safe_substitute(substitute)) + warn( + "'lgdo.lgdo_utils' has been renamed to 'lgdo.utils'. " + "Please replace either 'import lgdo.lgdo_utils as utils' with 'import lgdo.utils as utils' " + "or 'from lgdo.lgdo_utils import expand_vars' with 'from lgdo.utils import expand_vars'." + "'lgdo.lgdo_utils' will be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + return utils.expand_vars(expr, substitute) def expand_path( @@ -152,45 +59,12 @@ def expand_path( list: bool = False, base_path: str = None, ) -> str | list: - """Expand (environment) variables and wildcards to return absolute paths. - - Parameters - ---------- - path - name of path, which may include environment variables and wildcards. - list - if ``True``, return a list. If ``False``, return a string; if ``False`` - and a unique file is not found, raise an exception. - substitute - use this dictionary to substitute variables. Environment variables take - precedence. - base_path - name of base path. Returned paths will be relative to base. - - Returns - ------- - path or list of paths - Unique absolute path, or list of all absolute paths - """ - if base_path is not None and base_path != "": - base_path = os.path.expanduser(os.path.expandvars(base_path)) - path = os.path.join(base_path, path) - - # first expand variables - _path = expand_vars(path, substitute) - - # then expand wildcards - paths = sorted(glob.glob(os.path.expanduser(_path))) - - if base_path is not None and base_path != "": - paths = [os.path.relpath(p, base_path) for p in paths] - - if not list: - if len(paths) == 0: - raise FileNotFoundError(f"could not find path matching {path}") - elif len(paths) > 1: - raise FileNotFoundError(f"found multiple paths matching {path}") - else: - return paths[0] - else: - return paths + warn( + "'lgdo.lgdo_utils' has been renamed to 'lgdo.utils'. " + "Please replace either 'import lgdo.lgdo_utils as utils' with 'import lgdo.utils as utils' " + "or 'from lgdo.lgdo_utils import expand_path' with 'from lgdo.utils import expand_path'." + "'lgdo.lgdo_utils' will be removed in a future release. ", + DeprecationWarning, + stacklevel=2, + ) + return utils.expand_path(path, substitute, list, base_path) diff --git a/src/lgdo/lh5/__init__.py b/src/lgdo/lh5/__init__.py new file mode 100644 index 00000000..6263372a --- /dev/null +++ b/src/lgdo/lh5/__init__.py @@ -0,0 +1,18 @@ +"""Routines from reading and writing LEGEND Data Objects in HDF5 files. +Currently the primary on-disk format for LGDO object is LEGEND HDF5 (LH5) files. IO +is done via the class :class:`.store.LH5Store`. LH5 files can also be +browsed easily in python like any `HDF5 `_ file using +`h5py `_. 
+""" + +from .iterator import LH5Iterator +from .store import LH5Store, load_dfs, load_nda, ls, show + +__all__ = [ + "LH5Iterator", + "LH5Store", + "load_dfs", + "load_nda", + "ls", + "show", +] diff --git a/src/lgdo/lh5/iterator.py b/src/lgdo/lh5/iterator.py new file mode 100644 index 00000000..534a7c05 --- /dev/null +++ b/src/lgdo/lh5/iterator.py @@ -0,0 +1,310 @@ +from __future__ import annotations + +import logging +import typing as typing + +import numpy as np +import pandas as pd + +from ..types import Array, Scalar, Struct, VectorOfVectors +from .store import LH5Store +from .utils import expand_path + +LGDO = typing.Union[Array, Scalar, Struct, VectorOfVectors] + + +class LH5Iterator(typing.Iterator): + """ + A class for iterating through one or more LH5 files, one block of entries + at a time. This also accepts an entry list/mask to enable event selection, + and a field mask. + + This class can be used either for random access: + + >>> lh5_obj, n_rows = lh5_it.read(entry) + + to read the block of entries starting at entry. In case of multiple files + or the use of an event selection, entry refers to a global event index + across files and does not count events that are excluded by the selection. + + This can also be used as an iterator: + + >>> for lh5_obj, entry, n_rows in LH5Iterator(...): + >>> # do the thing! + + This is intended for if you are reading a large quantity of data but + want to limit your memory usage (particularly when reading in waveforms!). + The ``lh5_obj`` that is read by this class is reused in order to avoid + reallocation of memory; this means that if you want to hold on to data + between reads, you will have to copy it somewhere! + """ + + def __init__( + self, + lh5_files: str | list[str], + groups: str | list[str], + base_path: str = "", + entry_list: list[int] | list[list[int]] = None, + entry_mask: list[bool] | list[list[bool]] = None, + field_mask: dict[str, bool] | list[str] | tuple[str] = None, + buffer_len: int = 3200, + friend: typing.Iterator = None, + ) -> None: + """ + Parameters + ---------- + lh5_files + file or files to read from. May include wildcards and environment + variables. + groups + HDF5 group(s) to read. If a list is provided for both lh5_files + and group, they must be the same size. If a file is wild-carded, + the same group will be assigned to each file found + entry_list + list of entry numbers to read. If a nested list is provided, + expect one top-level list for each file, containing a list of + local entries. If a list of ints is provided, use global entries. + entry_mask + mask of entries to read. If a list of arrays is provided, expect + one for each file. Ignore if a selection list is provided. + field_mask + mask of which fields to read. See :meth:`LH5Store.read` for + more details. + buffer_len + number of entries to read at a time while iterating through files. + friend + a ''friend'' LH5Iterator that will be read in parallel with this. + The friend should have the same length and entry list. A single + LH5 table containing columns from both iterators will be returned. 
+ """ + self.lh5_st = LH5Store(base_path=base_path, keep_open=True) + + # List of files, with wildcards and env vars expanded + if isinstance(lh5_files, str): + lh5_files = [lh5_files] + if isinstance(groups, list): + lh5_files *= len(groups) + elif not isinstance(lh5_files, list): + raise ValueError("lh5_files must be a string or list of strings") + + if isinstance(groups, str): + groups = [groups] * len(lh5_files) + elif not isinstance(groups, list): + raise ValueError("group must be a string or list of strings") + + if not len(groups) == len(lh5_files): + raise ValueError("lh5_files and groups must have same length") + + self.lh5_files = [] + self.groups = [] + for f, g in zip(lh5_files, groups): + f_exp = expand_path(f, list=True, base_path=base_path) + self.lh5_files += f_exp + self.groups += [g] * len(f_exp) + + if entry_list is not None and entry_mask is not None: + raise ValueError( + "entry_list and entry_mask arguments are mutually exclusive" + ) + + # Map to last row in each file + self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i") + # Map to last iterator entry for each file + self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i") + self.buffer_len = buffer_len + + if len(self.lh5_files) > 0: + f = self.lh5_files[0] + g = self.groups[0] + self.lh5_buffer = self.lh5_st.get_buffer( + g, + f, + size=self.buffer_len, + field_mask=field_mask, + ) + self.file_map[0] = self.lh5_st.read_n_rows(g, f) + else: + raise RuntimeError(f"can't open any files from {lh5_files}") + + self.n_rows = 0 + self.current_entry = 0 + self.next_entry = 0 + + self.field_mask = field_mask + + # List of entry indices from each file + self.local_entry_list = None + self.global_entry_list = None + if entry_list is not None: + entry_list = list(entry_list) + if isinstance(entry_list[0], int): + self.local_entry_list = [None] * len(self.file_map) + self.global_entry_list = np.array(entry_list, "i") + self.global_entry_list.sort() + + else: + self.local_entry_list = [[]] * len(self.file_map) + for i_file, local_list in enumerate(entry_list): + self.local_entry_list[i_file] = np.array(local_list, "i") + self.local_entry_list[i_file].sort() + + elif entry_mask is not None: + # Convert entry mask into an entry list + if isinstance(entry_mask, pd.Series): + entry_mask = entry_mask.values + if isinstance(entry_mask, np.ndarray): + self.local_entry_list = [None] * len(self.file_map) + self.global_entry_list = np.nonzero(entry_mask)[0] + else: + self.local_entry_list = [[]] * len(self.file_map) + for i_file, local_mask in enumerate(entry_mask): + self.local_entry_list[i_file] = np.nonzero(local_mask)[0] + + # Attach the friend + if friend is not None: + if not isinstance(friend, typing.Iterator): + raise ValueError("Friend must be an Iterator") + self.lh5_buffer.join(friend.lh5_buffer) + self.friend = friend + + def _get_file_cumlen(self, i_file: int) -> int: + """Helper to get cumulative file length of file""" + if i_file < 0: + return 0 + fcl = self.file_map[i_file] + if fcl == np.iinfo("i").max: + fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows( + self.groups[i_file], self.lh5_files[i_file] + ) + self.file_map[i_file] = fcl + return fcl + + def _get_file_cumentries(self, i_file: int) -> int: + """Helper to get cumulative iterator entries in file""" + if i_file < 0: + return 0 + n = self.entry_map[i_file] + if n == np.iinfo("i").max: + elist = self.get_file_entrylist(i_file) + fcl = self._get_file_cumlen(i_file) + if elist is None: + # no entry list provided + n = fcl 
+ else: + file_entries = self.get_file_entrylist(i_file) + n = len(file_entries) + # check that file entries fall inside of file + if n > 0 and file_entries[-1] >= fcl: + logging.warning(f"Found entries out of range for file {i_file}") + n = np.searchsorted(file_entries, fcl, "right") + n += self._get_file_cumentries(i_file - 1) + self.entry_map[i_file] = n + return n + + def get_file_entrylist(self, i_file: int) -> np.ndarray: + """Helper to get entry list for file""" + # If no entry list is provided + if self.local_entry_list is None: + return None + + elist = self.local_entry_list[i_file] + if elist is None: + # Get local entrylist for this file from global entry list + f_start = self._get_file_cumlen(i_file - 1) + f_end = self._get_file_cumlen(i_file) + i_start = self._get_file_cumentries(i_file - 1) + i_stop = np.searchsorted(self.global_entry_list, f_end, "right") + elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start + self.local_entry_list[i_file] = elist + return elist + + def get_global_entrylist(self) -> np.ndarray: + """Get global entry list, constructing it if needed""" + if self.global_entry_list is None and self.local_entry_list is not None: + self.global_entry_list = np.zeros(len(self), "i") + for i_file in range(len(self.lh5_files)): + i_start = self._get_file_cumentries(i_file - 1) + i_stop = self._get_file_cumentries(i_file) + f_start = self._get_file_cumlen(i_file - 1) + self.global_entry_list[i_start:i_stop] = ( + self.get_file_entrylist(i_file) + f_start + ) + return self.global_entry_list + + def read(self, entry: int) -> tuple[LGDO, int]: + """Read the next local chunk of events, starting at entry. Return the + LH5 buffer and number of rows read.""" + self.n_rows = 0 + i_file = np.searchsorted(self.entry_map, entry, "right") + + # if file hasn't been opened yet, search through files + # sequentially until we find the right one + if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max: + while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries( + i_file + ): + i_file += 1 + + if i_file == len(self.lh5_files): + return (self.lh5_buffer, self.n_rows) + local_entry = entry - self._get_file_cumentries(i_file - 1) + + while self.n_rows < self.buffer_len and i_file < len(self.file_map): + # Loop through files + local_idx = self.get_file_entrylist(i_file) + if local_idx is not None and len(local_idx) == 0: + i_file += 1 + local_entry = 0 + continue + + i_local = local_idx[local_entry] if local_idx is not None else local_entry + self.lh5_buffer, n_rows = self.lh5_st.read( + self.groups[i_file], + self.lh5_files[i_file], + start_row=i_local, + n_rows=self.buffer_len - self.n_rows, + idx=local_idx, + field_mask=self.field_mask, + obj_buf=self.lh5_buffer, + obj_buf_start=self.n_rows, + ) + + self.n_rows += n_rows + i_file += 1 + local_entry = 0 + + self.current_entry = entry + + if self.friend is not None: + self.friend.read(entry) + + return (self.lh5_buffer, self.n_rows) + + def reset_field_mask(self, mask): + """Replaces the field mask of this iterator and any friends with mask""" + self.field_mask = mask + if self.friend is not None: + self.friend.reset_field_mask(mask) + + def __len__(self) -> int: + """Return the total number of entries.""" + return ( + self._get_file_cumentries(len(self.lh5_files) - 1) + if len(self.entry_map) > 0 + else 0 + ) + + def __iter__(self) -> typing.Iterator: + """Loop through entries in blocks of size buffer_len.""" + self.current_entry = 0 + self.next_entry = 0 + return self + + def 
__next__(self) -> tuple[LGDO, int, int]: + """Read next buffer_len entries and return lh5_table, iterator entry + and n_rows read.""" + buf, n_rows = self.read(self.next_entry) + self.next_entry = self.current_entry + n_rows + if n_rows == 0: + raise StopIteration + return (buf, self.current_entry, n_rows) diff --git a/src/lgdo/lh5/store.py b/src/lgdo/lh5/store.py new file mode 100644 index 00000000..3c2aa696 --- /dev/null +++ b/src/lgdo/lh5/store.py @@ -0,0 +1,1535 @@ +""" +This module implements routines from reading and writing LEGEND Data Objects in +HDF5 files. +""" +from __future__ import annotations + +import fnmatch +import glob +import logging +import os +import sys +from bisect import bisect_left +from collections import defaultdict +from typing import Any, Union + +import h5py +import numba as nb +import numpy as np +import pandas as pd + +from .. import compression as compress +from ..compression import WaveformCodec +from ..types import ( + Array, + ArrayOfEncodedEqualSizedArrays, + ArrayOfEqualSizedArrays, + FixedSizeArray, + Scalar, + Struct, + Table, + VectorOfEncodedVectors, + VectorOfVectors, + WaveformTable, +) +from .utils import expand_path, parse_datatype + +LGDO = Union[Array, Scalar, Struct, VectorOfVectors] + +log = logging.getLogger(__name__) + +DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"} +DEFAULT_HDF5_COMPRESSION = None + + +class LH5Store: + """ + Class to represent a store of LEGEND HDF5 files. The two main methods + implemented by the class are :meth:`read` and :meth:`write`. + + Examples + -------- + >>> from lgdo import LH5Store + >>> store = LH5Store() + >>> obj, _ = store.read("/geds/waveform", "file.lh5") + >>> type(obj) + lgdo.waveform_table.WaveformTable + """ + + def __init__(self, base_path: str = "", keep_open: bool = False) -> None: + """ + Parameters + ---------- + base_path + directory path to prepend to LH5 files. + keep_open + whether to keep files open by storing the :mod:`h5py` objects as + class attributes. + """ + self.base_path = "" if base_path == "" else expand_path(base_path) + self.keep_open = keep_open + self.files = {} + + def gimme_file(self, lh5_file: str | h5py.File, mode: str = "r") -> h5py.File: + """Returns a :mod:`h5py` file object from the store or creates a new one. + + Parameters + ---------- + lh5_file + LH5 file name. + mode + mode in which to open file. See :class:`h5py.File` documentation. + """ + if isinstance(lh5_file, h5py.File): + return lh5_file + if mode == "r": + lh5_file = expand_path(lh5_file, base_path=self.base_path) + if lh5_file in self.files.keys(): + return self.files[lh5_file] + if self.base_path != "": + full_path = os.path.join(self.base_path, lh5_file) + else: + full_path = lh5_file + if mode != "r": + directory = os.path.dirname(full_path) + if directory != "" and not os.path.exists(directory): + log.debug(f"making path {directory}") + os.makedirs(directory) + if mode == "r" and not os.path.exists(full_path): + raise FileNotFoundError(f"file {full_path} not found") + if mode != "r" and os.path.exists(full_path): + log.debug(f"opening existing file {full_path} in mode '{mode}'") + h5f = h5py.File(full_path, mode) + if self.keep_open: + self.files[lh5_file] = h5f + return h5f + + def gimme_group( + self, + group: str | h5py.Group, + base_group: h5py.Group, + grp_attrs: dict[str, Any] = None, + overwrite: bool = False, + ) -> h5py.Group: + """ + Returns an existing :class:`h5py` group from a base group or creates a + new one. 
Can also set (or replace) group attributes. + + Parameters + ---------- + group + name of the HDF5 group. + base_group + HDF5 group to be used as a base. + grp_attrs + HDF5 group attributes. + overwrite + whether overwrite group attributes, ignored if `grp_attrs` is + ``None``. + """ + if not isinstance(group, h5py.Group): + if group in base_group: + group = base_group[group] + else: + group = base_group.create_group(group) + if grp_attrs is not None: + group.attrs.update(grp_attrs) + return group + if ( + grp_attrs is not None + and len(set(grp_attrs.items()) ^ set(group.attrs.items())) > 0 + ): + if not overwrite: + raise RuntimeError("grp_attrs != group.attrs but overwrite not set") + else: + log.debug(f"overwriting {group}.attrs...") + for key in group.attrs.keys(): + group.attrs.pop(key) + group.attrs.update(grp_attrs) + return group + + def get_buffer( + self, + name: str, + lh5_file: str | h5py.File | list[str | h5py.File], + size: int = None, + field_mask: dict[str, bool] | list[str] | tuple[str] = None, + ) -> LGDO: + """Returns an LH5 object appropriate for use as a pre-allocated buffer + in a read loop. Sets size to `size` if object has a size. + """ + obj, n_rows = self.read(name, lh5_file, n_rows=0, field_mask=field_mask) + if hasattr(obj, "resize") and size is not None: + obj.resize(new_size=size) + return obj + + def read( + self, + name: str, + lh5_file: str | h5py.File | list[str | h5py.File], + start_row: int = 0, + n_rows: int = sys.maxsize, + idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None, + use_h5idx: bool = False, + field_mask: dict[str, bool] | list[str] | tuple[str] = None, + obj_buf: LGDO = None, + obj_buf_start: int = 0, + decompress: bool = True, + ) -> tuple[LGDO, int]: + """Read LH5 object data from a file. + + Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag + controls whether *only* those rows are read from disk or if the rows are indexed after reading + the entire object. Reading individual rows can be orders of magnitude slower than reading + the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``) + is to use slightly more memory for a much faster read. See + `legend-pydataobj #29 `_ + for additional information. + + Parameters + ---------- + name + Name of the LH5 object to be read (including its group path). + lh5_file + The file(s) containing the object to be read out. If a list of + files, array-like object data will be concatenated into the output + object. + start_row + Starting entry for the object read (for array-like objects). For a + list of files, only applies to the first file. + n_rows + The maximum number of rows to read (for array-like objects). The + actual number of rows read will be returned as one of the return + values (see below). + idx + For NumPy-style "fancying indexing" for the read to select only some + rows, e.g. after applying some cuts to particular columns. + Only selection along the first axis is supported, so tuple arguments + must be one-tuples. If `n_rows` is not false, `idx` will be truncated to + `n_rows` before reading. To use with a list of files, can pass in a list of + `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous + identical read). If used in conjunction with `start_row` and `n_rows`, + will be sliced to obey those constraints, where `n_rows` is + interpreted as the (max) number of *selected* values (in `idx`) to be + read out. 
Note that the ``use_h5idx`` parameter controls some behaviour of the + read and that the default behavior (``use_h5idx=False``) prioritizes speed over + a small memory penalty. + use_h5idx + ``True`` will directly pass the ``idx`` parameter to the underlying + ``h5py`` call such that only the selected rows are read directly into memory, + which conserves memory at the cost of speed. There can be a significant penalty + to speed for larger files (1 - 2 orders of magnitude longer time). + ``False`` (default) will read the entire object into memory before + performing the indexing. The default is much faster but requires additional memory, + though a relatively small amount in the typical use case. It is recommended to + leave this parameter as its default. + field_mask + For tables and structs, determines which fields get written out. + Only applies to immediate fields of the requested objects. If a dict + is used, a default dict will be made with the default set to the + opposite of the first element in the dict. This way if one specifies + a few fields at ``False``, all but those fields will be read out, + while if one specifies just a few fields as ``True``, only those + fields will be read out. If a list is provided, the listed fields + will be set to ``True``, while the rest will default to ``False``. + obj_buf + Read directly into memory provided in `obj_buf`. Note: the buffer + will be expanded to accommodate the data requested. To maintain the + buffer length, send in ``n_rows = len(obj_buf)``. + obj_buf_start + Start location in ``obj_buf`` for read. For concatenating data to + array-like objects. + decompress + Decompress data encoded with LGDO's compression routines right + after reading. The option has no effect on data encoded with HDF5 + built-in filters, which is always decompressed upstream by HDF5. + + + Returns + ------- + (object, n_rows_read) + `object` is the read-out object `n_rows_read` is the number of rows + successfully read out. Essential for arrays when the amount of data + is smaller than the object buffer. For scalars and structs + `n_rows_read` will be``1``. For tables it is redundant with + ``table.loc``. + """ + # Handle list-of-files recursively + if not isinstance(lh5_file, (str, h5py.File)): + lh5_file = list(lh5_file) + n_rows_read = 0 + + # to know whether we are reading in a list of files. + # this is part of the fix for reading data by idx + # (see https://github.com/legend-exp/legend-pydataobj/issues/29) + # so that we only make a copy of the data if absolutely necessary + # or if we can read the data from file without having to make a copy + self.in_file_loop = True + + for i, h5f in enumerate(lh5_file): + if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]): + # a list of lists: must be one per file + idx_i = idx[i] + elif idx is not None: + # make idx a proper tuple if it's not one already + if not (isinstance(idx, tuple) and len(idx) == 1): + idx = (idx,) + # idx is a long continuous array + n_rows_i = self.read_n_rows(name, h5f) + # find the length of the subset of idx that contains indices + # that are less than n_rows_i + n_rows_to_read_i = bisect_left(idx[0], n_rows_i) + # now split idx into idx_i and the remainder + idx_i = (idx[0][:n_rows_to_read_i],) + idx = (idx[0][n_rows_to_read_i:] - n_rows_i,) + else: + idx_i = None + n_rows_i = n_rows - n_rows_read + + # maybe someone passed in a list of len==1? 
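Since the `field_mask` and `idx`/`use_h5idx` semantics described above are easy to get backwards, a short hedged illustration (group and column names are taken from the notebooks in this patch, the file path is a placeholder):

```python
from lgdo.lh5 import LH5Store

store = LH5Store()

# list/tuple field_mask: read *only* the listed columns
obj, n = store.read("geds/raw", "file.lh5", field_mask=("timestamp", "energy"))

# dict field_mask with a False entry: read everything *except* that column
obj, n = store.read("geds/raw", "file.lh5", field_mask={"waveform": False})

# idx selects rows; use_h5idx=True reads only those rows from disk (less
# memory, slower), while the default reads the block and then indexes it
obj, n = store.read("geds/raw", "file.lh5", idx=[1, 3, 7, 9], use_h5idx=True)
```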
+ if i == (len(lh5_file) - 1): + self.in_file_loop = False + + obj_buf, n_rows_read_i = self.read( + name, + lh5_file[i], + start_row=start_row, + n_rows=n_rows_i, + idx=idx_i, + use_h5idx=use_h5idx, + field_mask=field_mask, + obj_buf=obj_buf, + obj_buf_start=obj_buf_start, + decompress=decompress, + ) + + n_rows_read += n_rows_read_i + if n_rows_read >= n_rows or obj_buf is None: + return obj_buf, n_rows_read + start_row = 0 + obj_buf_start += n_rows_read_i + + self.in_file_loop = False + + return obj_buf, n_rows_read + + # get the file from the store + h5f = self.gimme_file(lh5_file, "r") + if not h5f or name not in h5f: + raise KeyError(f"'{name}' not in {h5f.filename}") + + log.debug( + f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, " + + (f" with field mask {field_mask}" if field_mask else "") + ) + + # make idx a proper tuple if it's not one already + if not (isinstance(idx, tuple) and len(idx) == 1): + if idx is not None: + idx = (idx,) + + # get the object's datatype + if "datatype" not in h5f[name].attrs: + raise RuntimeError( + f"'{name}' in file {lh5_file} is missing the datatype attribute" + ) + + datatype = h5f[name].attrs["datatype"] + datatype, shape, elements = parse_datatype(datatype) + + # check field_mask and make it a default dict + if datatype == "struct" or datatype == "table": + if field_mask is None: + field_mask = defaultdict(lambda: True) + elif isinstance(field_mask, dict): + default = True + if len(field_mask) > 0: + default = not field_mask[list(field_mask.keys())[0]] + field_mask = defaultdict(lambda: default, field_mask) + elif isinstance(field_mask, (list, tuple)): + field_mask = defaultdict( + lambda: False, {field: True for field in field_mask} + ) + elif not isinstance(field_mask, defaultdict): + raise RuntimeError("bad field_mask of type", type(field_mask).__name__) + elif field_mask is not None: + raise RuntimeError(f"datatype {datatype} does not accept a field_mask") + + # Scalar + # scalars are dim-0 datasets + if datatype == "scalar": + value = h5f[name][()] + if elements == "bool": + value = np.bool_(value) + if obj_buf is not None: + obj_buf.value = value + obj_buf.attrs.update(h5f[name].attrs) + return obj_buf, 1 + else: + return Scalar(value=value, attrs=h5f[name].attrs), 1 + + # Struct + # recursively build a struct, return as a dictionary + if datatype == "struct": + # ignore obj_buf. + # TODO: could append new fields or overwrite/concat to existing + # fields. If implemented, get_buffer() above should probably also + # (optionally?) prep buffers for each field + if obj_buf is not None: + raise NotImplementedError("obj_buf not implemented for LGOD Structs") + + # loop over fields and read + obj_dict = {} + for field in elements: + if not field_mask[field]: + continue + # TODO: it's strange to pass start_row, n_rows, idx to struct + # fields. If they all had shared indexing, they should be in a + # table... Maybe should emit a warning? Or allow them to be + # dicts keyed by field name? 
+ if "int_keys" in h5f[name].attrs: + if dict(h5f[name].attrs)["int_keys"]: + f = int(field) + else: + f = str(field) + obj_dict[f], _ = self.read( + name + "/" + field, + h5f, + start_row=start_row, + n_rows=n_rows, + idx=idx, + use_h5idx=use_h5idx, + decompress=decompress, + ) + # modify datatype in attrs if a field_mask was used + attrs = dict(h5f[name].attrs) + if field_mask is not None: + selected_fields = [] + for field in elements: + if field_mask[field]: + selected_fields.append(field) + attrs["datatype"] = "struct" + "{" + ",".join(selected_fields) + "}" + return Struct(obj_dict=obj_dict, attrs=attrs), 1 + + # Below here is all array-like types. So trim idx if needed + if idx is not None: + # chop off indices < start_row + i_first_valid = bisect_left(idx[0], start_row) + idxa = idx[0][i_first_valid:] + # don't readout more than n_rows indices + idx = (idxa[:n_rows],) # works even if n_rows > len(idxa) + + # Table or WaveformTable + if datatype == "table": + col_dict = {} + + # read out each of the fields + rows_read = [] + for field in elements: + if not field_mask[field]: + continue + + fld_buf = None + if obj_buf is not None: + if not isinstance(obj_buf, Table) or field not in obj_buf: + raise ValueError( + f"obj_buf for LGDO Table '{name}' not formatted correctly" + ) + + else: + fld_buf = obj_buf[field] + + col_dict[field], n_rows_read = self.read( + name + "/" + field, + h5f, + start_row=start_row, + n_rows=n_rows, + idx=idx, + use_h5idx=use_h5idx, + obj_buf=fld_buf, + obj_buf_start=obj_buf_start, + decompress=decompress, + ) + if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf): + obj_buf.resize(obj_buf_start + n_rows_read) + + rows_read.append(n_rows_read) + + # warn if all columns don't read in the same number of rows + if len(rows_read) > 0: + n_rows_read = rows_read[0] + else: + n_rows_read = 0 + log.warning(f"Table '{name}' has no subgroups accepted by field mask") + + for n in rows_read[1:]: + if n != n_rows_read: + log.warning( + f"Table '{name}' got strange n_rows_read = {n}, {n_rows_read} was expected ({rows_read})" + ) + + # modify datatype in attrs if a field_mask was used + attrs = dict(h5f[name].attrs) + if field_mask is not None: + selected_fields = [] + for field in elements: + if field_mask[field]: + selected_fields.append(field) + attrs["datatype"] = "table" + "{" + ",".join(selected_fields) + "}" + + # fields have been read out, now return a table + if obj_buf is None: + # if col_dict contains just 3 objects called t0, dt, and values, + # return a WaveformTable + if ( + len(col_dict) == 3 + and "t0" in col_dict + and "dt" in col_dict + and "values" in col_dict + ): + table = WaveformTable( + t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"] + ) + else: + table = Table(col_dict=col_dict, attrs=attrs) + + # set (write) loc to end of tree + table.loc = n_rows_read + return table, n_rows_read + else: + # We have read all fields into the object buffer. Run + # checks: All columns should be the same size. So update + # table's size as necessary, warn if any mismatches are found + obj_buf.resize(do_warn=True) + # set (write) loc to end of tree + obj_buf.loc = obj_buf_start + n_rows_read + # check attributes + if set(obj_buf.attrs.keys()) != set(attrs.keys()): + raise RuntimeError( + f"attrs mismatch. 
obj_buf.attrs: " + f"{obj_buf.attrs}, h5f[{name}].attrs: {attrs}" + ) + return obj_buf, n_rows_read + + # ArrayOfEncodedEqualSizedArrays and VectorOfEncodedVectors + for cond, enc_lgdo in [ + ( + datatype == "array_of_encoded_equalsized_arrays", + ArrayOfEncodedEqualSizedArrays, + ), + (elements.startswith("encoded_array"), VectorOfEncodedVectors), + ]: + if cond: + if ( + not decompress + and obj_buf is not None + and not isinstance(obj_buf, enc_lgdo) + ): + raise ValueError(f"obj_buf for '{name}' not a {enc_lgdo}") + + # read out decoded_size, either a Scalar or an Array + decoded_size_buf = encoded_data_buf = None + if obj_buf is not None and not decompress: + decoded_size_buf = obj_buf.decoded_size + encoded_data_buf = obj_buf.encoded_data + + decoded_size, _ = self.read( + f"{name}/decoded_size", + h5f, + start_row=start_row, + n_rows=n_rows, + idx=idx, + use_h5idx=use_h5idx, + obj_buf=None if decompress else decoded_size_buf, + obj_buf_start=0 if decompress else obj_buf_start, + ) + + # read out encoded_data, a VectorOfVectors + encoded_data, n_rows_read = self.read( + f"{name}/encoded_data", + h5f, + start_row=start_row, + n_rows=n_rows, + idx=idx, + use_h5idx=use_h5idx, + obj_buf=None if decompress else encoded_data_buf, + obj_buf_start=0 if decompress else obj_buf_start, + ) + + # return the still encoded data in the buffer object, if there + if obj_buf is not None and not decompress: + return obj_buf, n_rows_read + + # otherwise re-create the encoded LGDO + rawdata = enc_lgdo( + encoded_data=encoded_data, + decoded_size=decoded_size, + attrs=h5f[name].attrs, + ) + + # already return if no decompression is requested + if not decompress: + return rawdata, n_rows_read + + # if no buffer, decode and return + elif obj_buf is None and decompress: + return compress.decode(rawdata), n_rows_read + + # eventually expand provided obj_buf, if too short + buf_size = obj_buf_start + n_rows_read + if len(obj_buf) < buf_size: + obj_buf.resize(buf_size) + + # use the (decoded object type) buffer otherwise + if enc_lgdo == ArrayOfEncodedEqualSizedArrays: + if not isinstance(obj_buf, ArrayOfEqualSizedArrays): + raise ValueError( + f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays" + ) + + compress.decode(rawdata, obj_buf[obj_buf_start:buf_size]) + + elif enc_lgdo == VectorOfEncodedVectors: + if not isinstance(obj_buf, VectorOfVectors): + raise ValueError( + f"obj_buf for decoded '{name}' not a VectorOfVectors" + ) + + # FIXME: not a good idea. 
an in place decoding version + # of decode would be needed to avoid extra memory + # allocations + for i, wf in enumerate(compress.decode(rawdata)): + obj_buf[obj_buf_start + i] = wf + + return obj_buf, n_rows_read + + # VectorOfVectors + # read out vector of vectors of different size + if elements.startswith("array"): + if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors): + raise ValueError(f"obj_buf for '{name}' not a LGDO VectorOfVectors") + + # read out cumulative_length + cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length + cumulative_length, n_rows_read = self.read( + f"{name}/cumulative_length", + h5f, + start_row=start_row, + n_rows=n_rows, + idx=idx, + use_h5idx=use_h5idx, + obj_buf=cumulen_buf, + obj_buf_start=obj_buf_start, + ) + # get a view of just what was read out for cleaner code below + this_cumulen_nda = cumulative_length.nda[ + obj_buf_start : obj_buf_start + n_rows_read + ] + + if idx is not None and n_rows_read > 0: + # get the starting indices for each array in flattended data: + # the starting index for array[i] is cumulative_length[i-1] + idx2 = (np.asarray(idx[0]).copy() - 1,) + # re-read cumulative_length with these indices + # note this will allocate memory for fd_starts! + fd_start = None + if idx2[0][0] == -1: + idx2 = (idx2[0][1:],) + fd_start = 0 # this variable avoids an ndarray append + fd_starts, fds_n_rows_read = self.read( + f"{name}/cumulative_length", + h5f, + start_row=start_row, + n_rows=n_rows, + idx=idx2, + use_h5idx=use_h5idx, + ) + fd_starts = fd_starts.nda # we just need the nda + if fd_start is None: + fd_start = fd_starts[0] + + # compute the length that flattened_data will have after the + # fancy-indexed read + fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts) + if fd_start == 0: + fd_n_rows += this_cumulen_nda[0] + + # now make fd_idx + fd_idx = np.empty(fd_n_rows, dtype="uint32") + fd_idx = _make_fd_idx(fd_starts, this_cumulen_nda, fd_idx) + + # Now clean up this_cumulen_nda, to be ready + # to match the in-memory version of flattened_data. Note: these + # operations on the view change the original array because they are + # numpy arrays, not lists. + this_cumulen_nda[-len(fd_starts) :] -= fd_starts + np.cumsum(this_cumulen_nda, out=this_cumulen_nda) + + else: + fd_idx = None + + # determine the start_row and n_rows for the flattened_data readout + fd_start = 0 + if start_row > 0 and n_rows_read > 0: + # need to read out the cumulen sample -before- the first sample + # read above in order to get the starting row of the first + # vector to read out in flattened_data + fd_start = h5f[f"{name}/cumulative_length"][start_row - 1] + + # check limits for values that will be used subsequently + if this_cumulen_nda[-1] < fd_start: + log.debug( + f"this_cumulen_nda[-1] = {this_cumulen_nda[-1]}, " + f"fd_start = {fd_start}, " + f"start_row = {start_row}, " + f"n_rows_read = {n_rows_read}" + ) + raise RuntimeError( + f"cumulative_length non-increasing between entries " + f"{start_row} and {start_row+n_rows_read} ??" + ) + + # determine the number of rows for the flattened_data readout + fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0 + + # Now done with this_cumulen_nda, so we can clean it up to be ready + # to match the in-memory version of flattened_data. Note: these + # operations on the view change the original array because they are + # numpy arrays, not lists. 
+ # + # First we need to subtract off the in-file offset for the start of + # read for flattened_data + this_cumulen_nda -= fd_start + + # If we started with a partially-filled buffer, add the + # appropriate offset for the start of the in-memory flattened + # data for this read. + fd_buf_start = np.uint32(0) + if obj_buf_start > 0: + fd_buf_start = cumulative_length.nda[obj_buf_start - 1] + this_cumulen_nda += fd_buf_start + + # Now prepare the object buffer if necessary + fd_buf = None + if obj_buf is not None: + fd_buf = obj_buf.flattened_data + # grow fd_buf if necessary to hold the data + fdb_size = fd_buf_start + fd_n_rows + if len(fd_buf) < fdb_size: + fd_buf.resize(fdb_size) + + # now read + flattened_data, dummy_rows_read = self.read( + f"{name}/flattened_data", + h5f, + start_row=fd_start, + n_rows=fd_n_rows, + idx=fd_idx, + use_h5idx=use_h5idx, + obj_buf=fd_buf, + obj_buf_start=fd_buf_start, + ) + if obj_buf is not None: + return obj_buf, n_rows_read + return ( + VectorOfVectors( + flattened_data=flattened_data, + cumulative_length=cumulative_length, + attrs=h5f[name].attrs, + ), + n_rows_read, + ) + + # Array + # FixedSizeArray + # ArrayOfEqualSizedArrays + # read out all arrays by slicing + if "array" in datatype: + if obj_buf is not None: + if not isinstance(obj_buf, Array): + raise ValueError(f"obj_buf for '{name}' not an LGDO Array") + obj_buf = None + + # compute the number of rows to read + # we culled idx above for start_row and n_rows, now we have to apply + # the constraint of the length of the dataset + ds_n_rows = h5f[name].shape[0] + if idx is not None: + if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows: + log.warning( + "idx indexed past the end of the array in the file. Culling..." + ) + n_rows_to_read = bisect_left(idx[0], ds_n_rows) + idx = (idx[0][:n_rows_to_read],) + if len(idx[0]) == 0: + log.warning("idx empty after culling.") + n_rows_to_read = len(idx[0]) + else: + n_rows_to_read = ds_n_rows - start_row + if n_rows_to_read > n_rows: + n_rows_to_read = n_rows + + # if idx is passed, check if we can make it a slice instead (faster) + change_idx_to_slice = False + + # prepare the selection for the read. Use idx if available + if idx is not None: + # check if idx is empty and convert to slice instead + if len(idx[0]) == 0: + source_sel = np.s_[0:0] + change_idx_to_slice = True + # check if idx is contiguous and increasing + # if so, convert it to a slice instead (faster) + elif np.all(np.diff(idx[0]) == 1): + source_sel = np.s_[idx[0][0] : idx[0][-1] + 1] + change_idx_to_slice = True + else: + source_sel = idx + else: + source_sel = np.s_[start_row : start_row + n_rows_to_read] + + # Now read the array + if obj_buf is not None and n_rows_to_read > 0: + buf_size = obj_buf_start + n_rows_to_read + if len(obj_buf) < buf_size: + obj_buf.resize(buf_size) + dest_sel = np.s_[obj_buf_start:buf_size] + + # this is required to make the read of multiple files faster + # until a better solution found. 
+ if change_idx_to_slice or idx is None or use_h5idx: + h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel) + else: + # it is faster to read the whole object and then do fancy indexing + obj_buf.nda[dest_sel] = h5f[name][...][source_sel] + + nda = obj_buf.nda + else: + if n_rows == 0: + tmp_shape = (0,) + h5f[name].shape[1:] + nda = np.empty(tmp_shape, h5f[name].dtype) + else: + if change_idx_to_slice or idx is None or use_h5idx: + nda = h5f[name][source_sel] + else: + # it is faster to read the whole object and then do fancy indexing + nda = h5f[name][...][source_sel] + + # if reading a list of files recursively, this is given to obj_buf on + # the first file read. obj_buf needs to be resized and therefore + # it needs to hold the data itself (not a view of the data). + # a view is returned by the source_sel indexing, which cannot be resized + # by ndarray.resize(). + if hasattr(self, "in_file_loop") and self.in_file_loop: + nda = np.copy(nda) + + # special handling for bools + # (C and Julia store as uint8 so cast to bool) + if elements == "bool": + nda = nda.astype(np.bool_) + + # Finally, set attributes and return objects + attrs = h5f[name].attrs + if obj_buf is None: + if datatype == "array": + return Array(nda=nda, attrs=attrs), n_rows_to_read + if datatype == "fixedsize_array": + return FixedSizeArray(nda=nda, attrs=attrs), n_rows_to_read + if datatype == "array_of_equalsized_arrays": + return ( + ArrayOfEqualSizedArrays(nda=nda, dims=shape, attrs=attrs), + n_rows_to_read, + ) + else: + if set(obj_buf.attrs.keys()) != set(attrs.keys()): + raise RuntimeError( + f"attrs mismatch. " + f"obj_buf.attrs: {obj_buf.attrs}, " + f"h5f[{name}].attrs: {attrs}" + ) + return obj_buf, n_rows_to_read + + raise RuntimeError(f"don't know how to read datatype {datatype}") + + def write( + self, + obj: LGDO, + name: str, + lh5_file: str | h5py.File, + group: str | h5py.Group = "/", + start_row: int = 0, + n_rows: int = None, + wo_mode: str = "append", + write_start: int = 0, + **h5py_kwargs, + ) -> None: + """Write an LGDO into an LH5 file. + + If the `obj` :class:`.LGDO` has a `compression` attribute, its value is + interpreted as the algorithm to be used to compress `obj` before + writing to disk. The type of `compression` can be: + + string, kwargs dictionary, hdf5plugin filter + interpreted as the name of a built-in or custom `HDF5 compression + filter `_ + (``"gzip"``, ``"lzf"``, :mod:`hdf5plugin` filter object etc.) and + passed directly to :meth:`h5py.Group.create_dataset`. + + :class:`.WaveformCodec` object + If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds the + attribute, compress ``values`` using this algorithm. More + documentation about the supported waveform compression algorithms at + :mod:`.lgdo.compression`. + + If the `obj` :class:`.LGDO` has a `hdf5_settings` attribute holding a + dictionary, it is interpreted as a list of keyword arguments to be + forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like + the first format of `compression` above). This is the preferred way to + specify HDF5 dataset options such as chunking etc. If compression + options are specified, they take precedence over those set with the + `compression` attribute. + + Note + ---- + The `compression` LGDO attribute takes precedence over the default HDF5 + compression settings. The `hdf5_settings` attribute takes precedence + over `compression`. These attributes are not written to disk. 
+ + Note + ---- + HDF5 compression is skipped for the `encoded_data.flattened_data` + dataset of :class:`.VectorOfEncodedVectors` and + :class:`.ArrayOfEncodedEqualSizedArrays`. + + Parameters + ---------- + obj + LH5 object. if object is array-like, writes `n_rows` starting from + `start_row` in `obj`. + name + name of the object in the output HDF5 file. + lh5_file + HDF5 file name or :class:`h5py.File` object. + group + HDF5 group name or :class:`h5py.Group` object in which `obj` should + be written. + start_row + first row in `obj` to be written. + n_rows + number of rows in `obj` to be written. + wo_mode + - ``write_safe`` or ``w``: only proceed with writing if the + object does not already exist in the file. + - ``append`` or ``a``: append along axis 0 (the first dimension) + of array-like objects and array-like subfields of structs. + :class:`~.lgdo.scalar.Scalar` objects get overwritten. + - ``overwrite`` or ``o``: replace data in the file if present, + starting from `write_start`. Note: overwriting with `write_start` = + end of array is the same as ``append``. + - ``overwrite_file`` or ``of``: delete file if present prior to + writing to it. `write_start` should be 0 (its ignored). + - ``append_column`` or ``ac``: append columns from an :class:`~.lgdo.table.Table` + `obj` only if there is an existing :class:`~.lgdo.table.Table` in the `lh5_file` with + the same `name` and :class:`~.lgdo.table.Table.size`. If the sizes don't match, + or if there are matching fields, it errors out. + write_start + row in the output file (if already existing) to start overwriting + from. + **h5py_kwargs + additional keyword arguments forwarded to + :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5 + compression filter to be applied before writing non-scalar + datasets. **Note: `compression` Ignored if compression is specified + as an `obj` attribute.** + """ + log.debug( + f"writing {repr(obj)}[{start_row}:{n_rows}] as " + f"{lh5_file}:{group}/{name}[{write_start}:], " + f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}" + ) + + if wo_mode == "write_safe": + wo_mode = "w" + if wo_mode == "append": + wo_mode = "a" + if wo_mode == "overwrite": + wo_mode = "o" + if wo_mode == "overwrite_file": + wo_mode = "of" + write_start = 0 + if wo_mode == "append_column": + wo_mode = "ac" + if wo_mode not in ["w", "a", "o", "of", "ac"]: + raise ValueError(f"unknown wo_mode '{wo_mode}'") + + # "mode" is for the h5df.File and wo_mode is for this function + # In hdf5, 'a' is really "modify" -- in addition to appending, you can + # change any object in the file. So we use file:append for + # write_object:overwrite. + mode = "w" if wo_mode == "of" else "a" + lh5_file = self.gimme_file(lh5_file, mode=mode) + group = self.gimme_group(group, lh5_file) + if wo_mode == "w" and name in group: + raise RuntimeError(f"can't overwrite '{name}' in wo_mode 'write_safe'") + + # struct or table or waveform table + if isinstance(obj, Struct): + # In order to append a column, we need to update the `table{old_fields}` value in `group.attrs['datatype"]` to include the new fields. + # One way to do this is to override `obj.attrs["datatype"]` to include old and new fields. Then we can write the fields to the table as normal. 
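+            # Illustration of the rewrite (field names are made up): appending a
+            # column "baseline" to an on-disk "table{energy,timestamp}" turns the
+            # datatype attribute into "table{energy,timestamp,baseline}".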
+ if wo_mode == "ac": + old_group = self.gimme_group(name, group) + datatype, shape, fields = parse_datatype(old_group.attrs["datatype"]) + if datatype not in ["table", "struct"]: + raise RuntimeError( + f"Trying to append columns to an object of type {datatype}" + ) + + # If the mode is `append_column`, make sure we aren't appending a table that has a column of the same name as in the existing table + # Also make sure that the field we are adding has the same size + if len(list(set(fields).intersection(set(obj.keys())))) != 0: + raise ValueError( + f"Can't append {list(set(fields).intersection(set(obj.keys())))} column(s) to a table with the same field(s)" + ) + # It doesn't matter what key we access, as all fields in the old table have the same size + if old_group[list(old_group.keys())[0]].size != obj.size: + raise ValueError( + f"Table sizes don't match. Trying to append column of size {obj.size} to a table of size {old_group[list(old_group.keys())[0]].size}." + ) + + # Now we can append the obj.keys() to the old fields, and then update obj.attrs. + fields.extend(list(obj.keys())) + obj.attrs.pop("datatype") + obj.attrs["datatype"] = "table" + "{" + ",".join(fields) + "}" + + group = self.gimme_group( + name, + group, + grp_attrs=obj.attrs, + overwrite=(wo_mode in ["o", "ac"]), + ) + # If the mode is overwrite, then we need to peek into the file's table's existing fields + # If we are writing a new table to the group that does not contain an old field, we should delete that old field from the file + if wo_mode == "o": + # Find the old keys in the group that are not present in the new table's keys, then delete them + for key in list(set(group.keys()) - set(obj.keys())): + log.debug(f"{key} is not present in new table, deleting field") + del group[key] + + for field in obj.keys(): + # eventually compress waveform table values with LGDO's + # custom codecs before writing + # if waveformtable.values.attrs["compression"] is NOT a + # WaveformCodec, just leave it there + obj_fld = None + if ( + isinstance(obj, WaveformTable) + and field == "values" + and not isinstance(obj.values, VectorOfEncodedVectors) + and not isinstance(obj.values, ArrayOfEncodedEqualSizedArrays) + and "compression" in obj.values.attrs + and isinstance(obj.values.attrs["compression"], WaveformCodec) + ): + codec = obj.values.attrs["compression"] + obj_fld = compress.encode(obj.values, codec=codec) + else: + obj_fld = obj[field] + + # Convert keys to string for dataset names + f = str(field) + self.write( + obj_fld, + f, + lh5_file, + group=group, + start_row=start_row, + n_rows=n_rows, + wo_mode=wo_mode, + write_start=write_start, + **h5py_kwargs, + ) + return + + # scalars + elif isinstance(obj, Scalar): + if name in group: + if wo_mode in ["o", "a"]: + log.debug(f"overwriting {name} in {group}") + del group[name] + else: + raise RuntimeError( + f"tried to overwrite {name} in {group} for wo_mode {wo_mode}" + ) + ds = group.create_dataset(name, shape=(), data=obj.value) + ds.attrs.update(obj.attrs) + return + + # vector of encoded vectors + elif isinstance(obj, (VectorOfEncodedVectors, ArrayOfEncodedEqualSizedArrays)): + group = self.gimme_group( + name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o") + ) + + # ask not to further compress flattened_data, it is already compressed! 
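+            # (the waveform codec has already shrunk these bytes, so stacking an
+            # HDF5 filter on top would mostly just cost CPU time)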
+ obj.encoded_data.flattened_data.attrs["compression"] = None + + self.write( + obj.encoded_data, + "encoded_data", + lh5_file, + group=group, + start_row=start_row, + n_rows=n_rows, + wo_mode=wo_mode, + write_start=write_start, + **h5py_kwargs, + ) + + self.write( + obj.decoded_size, + "decoded_size", + lh5_file, + group=group, + start_row=start_row, + n_rows=n_rows, + wo_mode=wo_mode, + write_start=write_start, + **h5py_kwargs, + ) + + # vector of vectors + elif isinstance(obj, VectorOfVectors): + group = self.gimme_group( + name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o") + ) + if ( + n_rows is None + or n_rows > obj.cumulative_length.nda.shape[0] - start_row + ): + n_rows = obj.cumulative_length.nda.shape[0] - start_row + + # if appending we need to add an appropriate offset to the + # cumulative lengths as appropriate for the in-file object + offset = 0 # declare here because we have to subtract it off at the end + if (wo_mode == "a" or wo_mode == "o") and "cumulative_length" in group: + len_cl = len(group["cumulative_length"]) + if wo_mode == "a": + write_start = len_cl + if len_cl > 0: + offset = group["cumulative_length"][write_start - 1] + + # First write flattened_data array. Only write rows with data. + fd_start = 0 if start_row == 0 else obj.cumulative_length.nda[start_row - 1] + fd_n_rows = obj.cumulative_length.nda[start_row + n_rows - 1] - fd_start + self.write( + obj.flattened_data, + "flattened_data", + lh5_file, + group=group, + start_row=fd_start, + n_rows=fd_n_rows, + wo_mode=wo_mode, + write_start=offset, + **h5py_kwargs, + ) + + # now offset is used to give appropriate in-file values for + # cumulative_length. Need to adjust it for start_row + if start_row > 0: + offset -= obj.cumulative_length.nda[start_row - 1] + + # Add offset to obj.cumulative_length itself to avoid memory allocation. + # Then subtract it off after writing! (otherwise it will be changed + # upon return) + cl_dtype = obj.cumulative_length.nda.dtype.type + obj.cumulative_length.nda += cl_dtype(offset) + + self.write( + obj.cumulative_length, + "cumulative_length", + lh5_file, + group=group, + start_row=start_row, + n_rows=n_rows, + wo_mode=wo_mode, + write_start=write_start, + **h5py_kwargs, + ) + obj.cumulative_length.nda -= cl_dtype(offset) + + return + + # if we get this far, must be one of the Array types + elif isinstance(obj, Array): + if n_rows is None or n_rows > obj.nda.shape[0] - start_row: + n_rows = obj.nda.shape[0] - start_row + + nda = obj.nda[start_row : start_row + n_rows] + + # hack to store bools as uint8 for c / Julia compliance + if nda.dtype.name == "bool": + nda = nda.astype(np.uint8) + + # need to create dataset from ndarray the first time for speed + # creating an empty dataset and appending to that is super slow! + if (wo_mode != "a" and write_start == 0) or name not in group: + # this is needed in order to have a resizable (in the first + # axis) data set, i.e. rows can be appended later + # NOTE: this automatically turns chunking on! 
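+                # (HDF5 only allows resizing along axes declared resizable at
+                # creation time, hence maxshape=(None, ...) for axis 0; h5py then
+                # stores the dataset chunked)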
+ maxshape = (None,) + nda.shape[1:] + h5py_kwargs.setdefault("maxshape", maxshape) + + if wo_mode == "o" and name in group: + log.debug(f"overwriting {name} in {group}") + del group[name] + + # set default compression options + for k, v in DEFAULT_HDF5_SETTINGS.items(): + h5py_kwargs.setdefault(k, v) + + # compress using the 'compression' LGDO attribute, if available + if "compression" in obj.attrs: + comp_algo = obj.attrs["compression"] + if isinstance(comp_algo, dict): + h5py_kwargs |= obj.attrs["compression"] + else: + h5py_kwargs["compression"] = obj.attrs["compression"] + + # and even the 'hdf5_settings' one, preferred + if "hdf5_settings" in obj.attrs: + h5py_kwargs |= obj.attrs["hdf5_settings"] + + # create HDF5 dataset + ds = group.create_dataset(name, data=nda, **h5py_kwargs) + + # attach HDF5 dataset attributes, but not "compression"! + _attrs = obj.getattrs(datatype=True) + _attrs.pop("compression", None) + _attrs.pop("hdf5_settings", None) + ds.attrs.update(_attrs) + return + + # Now append or overwrite + ds = group[name] + if not isinstance(ds, h5py.Dataset): + raise RuntimeError( + f"existing HDF5 object '{name}' in group '{group}'" + " is not a dataset! Cannot overwrite or append" + ) + + old_len = ds.shape[0] + if wo_mode == "a": + write_start = old_len + add_len = write_start + nda.shape[0] - old_len + ds.resize(old_len + add_len, axis=0) + ds[write_start:] = nda + return + + else: + raise RuntimeError( + f"do not know how to write '{name}' of type '{type(obj).__name__}'" + ) + + def read_n_rows(self, name: str, lh5_file: str | h5py.File) -> int | None: + """Look up the number of rows in an Array-like object called `name` in + `lh5_file`. + + Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`.""" + # this is basically a stripped down version of read_object + h5f = self.gimme_file(lh5_file, "r") + if not h5f or name not in h5f: + raise KeyError(f"'{name}' not in {lh5_file}") + + # get the datatype + if "datatype" not in h5f[name].attrs: + raise RuntimeError( + f"'{name}' in file {lh5_file} is missing the datatype attribute" + ) + + datatype = h5f[name].attrs["datatype"] + datatype, shape, elements = parse_datatype(datatype) + + # scalars are dim-0 datasets + if datatype == "scalar": + return None + + # structs don't have rows + if datatype == "struct": + return None + + # tables should have elements with all the same length + if datatype == "table": + # read out each of the fields + rows_read = None + for field in elements: + n_rows_read = self.read_n_rows(name + "/" + field, h5f) + if not rows_read: + rows_read = n_rows_read + elif rows_read != n_rows_read: + log.warning( + f"'{field}' field in table '{name}' has {rows_read} rows, " + f"{n_rows_read} was expected" + ) + return rows_read + + # length of vector of vectors is the length of its cumulative_length + if elements.startswith("array"): + return self.read_n_rows(f"{name}/cumulative_length", h5f) + + # length of vector of encoded vectors is the length of its decoded_size + if ( + elements.startswith("encoded_array") + or datatype == "array_of_encoded_equalsized_arrays" + ): + return self.read_n_rows(f"{name}/encoded_data", h5f) + + # return array length (without reading the array!) 
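+        # (axis 0 is the row axis for all array-like LGDOs, so shape[0] is
+        # sufficient even for ArrayOfEqualSizedArrays)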
+ if "array" in datatype: + # compute the number of rows to read + return h5f[name].shape[0] + + raise RuntimeError(f"don't know how to read datatype '{datatype}'") + + +def ls(lh5_file: str | h5py.Group, lh5_group: str = "") -> list[str]: + """Return a list of LH5 groups in the input file and group, similar + to ``ls`` or ``h5ls``. Supports wildcards in group names. + + + Parameters + ---------- + lh5_file + name of file. + lh5_group + group to search. add a ``/`` to the end of the group name if you want to + list all objects inside that group. + """ + + log.debug( + f"Listing objects in '{lh5_file}'" + + ("" if lh5_group == "" else f" (and group {lh5_group})") + ) + + lh5_st = LH5Store() + # To use recursively, make lh5_file a h5group instead of a string + if isinstance(lh5_file, str): + lh5_file = lh5_st.gimme_file(lh5_file, "r") + if lh5_group.startswith("/"): + lh5_group = lh5_group[1:] + + if lh5_group == "": + lh5_group = "*" + + splitpath = lh5_group.split("/", 1) + matchingkeys = fnmatch.filter(lh5_file.keys(), splitpath[0]) + + if len(splitpath) == 1: + return matchingkeys + else: + ret = [] + for key in matchingkeys: + ret.extend([f"{key}/{path}" for path in ls(lh5_file[key], splitpath[1])]) + return ret + + +def show( + lh5_file: str | h5py.Group, + lh5_group: str = "/", + attrs: bool = False, + indent: str = "", + header: bool = True, +) -> None: + """Print a tree of LH5 file contents with LGDO datatype. + + Parameters + ---------- + lh5_file + the LH5 file. + lh5_group + print only contents of this HDF5 group. + attrs + print the HDF5 attributes too. + indent + indent the diagram with this string. + header + print `lh5_group` at the top of the diagram. + + Examples + -------- + >>> from lgdo import show + >>> show("file.lh5", "/geds/raw") + /geds/raw + ├── channel · array<1>{real} + ├── energy · array<1>{real} + ├── timestamp · array<1>{real} + ├── waveform · table{t0,dt,values} + │ ├── dt · array<1>{real} + │ ├── t0 · array<1>{real} + │ └── values · array_of_equalsized_arrays<1,1>{real} + └── wf_std · array<1>{real} + """ + # open file + if isinstance(lh5_file, str): + lh5_file = h5py.File(expand_path(lh5_file), "r") + + # go to group + if lh5_group != "/": + lh5_file = lh5_file[lh5_group] + + if header: + print(f"\033[1m{lh5_group}\033[0m") # noqa: T201 + + # get an iterator over the keys in the group + it = iter(lh5_file) + key = None + + # make sure there is actually something in this file/group + try: + key = next(it) # get first key + except StopIteration: + print(f"{indent}└── empty") # noqa: T201 + return + + # loop over keys + while True: + val = lh5_file[key] + # we want to print the LGDO datatype + dtype = val.attrs.get("datatype", default="no datatype") + if dtype == "no datatype" and isinstance(val, h5py.Group): + dtype = "HDF5 group" + + _attrs = "" + if attrs: + attrs_d = dict(val.attrs) + attrs_d.pop("datatype", "") + _attrs = "── " + str(attrs_d) if attrs_d else "" + + # is this the last key? 
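+        # (peek ahead: the final entry gets a "└──" connector and closes the
+        # branch, every other entry gets "├──")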
+ killme = False + try: + k_new = next(it) # get next key + except StopIteration: + char = "└──" + killme = True # we'll have to kill this loop later + else: + char = "├──" + + print(f"{indent}{char} \033[1m{key}\033[0m · {dtype} {_attrs}") # noqa: T201 + + # if it's a group, call this function recursively + if isinstance(val, h5py.Group): + show( + val, + indent=indent + (" " if killme else "│ "), + header=False, + attrs=attrs, + ) + + # break or move to next key + if killme: + break + else: + key = k_new + + +def load_nda( + f_list: str | list[str], + par_list: list[str], + lh5_group: str = "", + idx_list: list[np.ndarray | list | tuple] = None, +) -> dict[str, np.ndarray]: + r"""Build a dictionary of :class:`numpy.ndarray`\ s from LH5 data. + + Given a list of files, a list of LH5 table parameters, and an optional + group path, return a NumPy array with all values for each parameter. + + Parameters + ---------- + f_list + A list of files. Can contain wildcards. + par_list + A list of parameters to read from each file. + lh5_group + group path within which to find the specified parameters. + idx_list + for fancy-indexed reads. Must be one index array for each file in + `f_list`. + + Returns + ------- + par_data + A dictionary of the parameter data keyed by the elements of `par_list`. + Each entry contains the data for the specified parameter concatenated + over all files in `f_list`. + """ + if isinstance(f_list, str): + f_list = [f_list] + if idx_list is not None: + idx_list = [idx_list] + if idx_list is not None and len(f_list) != len(idx_list): + raise ValueError( + f"f_list length ({len(f_list)}) != idx_list length ({len(idx_list)})!" + ) + + # Expand wildcards + f_list = [f for f_wc in f_list for f in sorted(glob.glob(os.path.expandvars(f_wc)))] + + sto = LH5Store() + par_data = {par: [] for par in par_list} + for ii, f in enumerate(f_list): + f = sto.gimme_file(f, "r") + for par in par_list: + if f"{lh5_group}/{par}" not in f: + raise RuntimeError(f"'{lh5_group}/{par}' not in file {f_list[ii]}") + + if idx_list is None: + data, _ = sto.read(f"{lh5_group}/{par}", f) + else: + data, _ = sto.read(f"{lh5_group}/{par}", f, idx=idx_list[ii]) + if not data: + continue + par_data[par].append(data.nda) + par_data = {par: np.concatenate(par_data[par]) for par in par_list} + return par_data + + +def load_dfs( + f_list: str | list[str], + par_list: list[str], + lh5_group: str = "", + idx_list: list[np.ndarray | list | tuple] = None, +) -> pd.DataFrame: + """Build a :class:`pandas.DataFrame` from LH5 data. + + Given a list of files (can use wildcards), a list of LH5 columns, and + optionally the group path, return a :class:`pandas.DataFrame` with all + values for each parameter. + + See Also + -------- + :func:`load_nda` + + Returns + ------- + dataframe + contains columns for each parameter in `par_list`, and rows containing + all data for the associated parameters concatenated over all files in + `f_list`. 
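+
+    Examples
+    --------
+    A minimal sketch (file, group and column names are placeholders):
+
+    >>> df = load_dfs("file.lh5", ["energy", "timestamp"], lh5_group="geds/raw")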
+ """ + return pd.DataFrame( + load_nda(f_list, par_list, lh5_group=lh5_group, idx_list=idx_list) + ) + + +@nb.njit(parallel=False, fastmath=True) +def _make_fd_idx(starts, stops, idx): + k = 0 + if len(starts) < len(stops): + for i in range(stops[0]): + idx[k] = i + k += 1 + stops = stops[1:] + for j in range(len(starts)): + for i in range(starts[j], stops[j]): + idx[k] = i + k += 1 + return (idx,) diff --git a/src/lgdo/lh5/utils.py b/src/lgdo/lh5/utils.py new file mode 100644 index 00000000..bc1fd425 --- /dev/null +++ b/src/lgdo/lh5/utils.py @@ -0,0 +1,118 @@ +"""Implements utilities for LEGEND Data Objects.""" +from __future__ import annotations + +import glob +import logging +import os +import string + +log = logging.getLogger(__name__) + + +def parse_datatype(datatype: str) -> tuple[str, tuple[int, ...], str | list[str]]: + """Parse datatype string and return type, dimensions and elements. + + Parameters + ---------- + datatype + a LGDO-formatted datatype string. + + Returns + ------- + element_type + the datatype name dims if not ``None``, a tuple of dimensions for the + LGDO. Note this is not the same as the NumPy shape of the underlying + data object. See the LGDO specification for more information. Also see + :class:`~.types.ArrayOfEqualSizedArrays` and + :meth:`.lh5_store.LH5Store.read` for example code elements for + numeric objects, the element type for struct-like objects, the list of + fields in the struct. + """ + if "{" not in datatype: + return "scalar", None, datatype + + # for other datatypes, need to parse the datatype string + from parse import parse + + datatype, element_description = parse("{}{{{}}}", datatype) + if datatype.endswith(">"): + datatype, dims = parse("{}<{}>", datatype) + dims = [int(i) for i in dims.split(",")] + return datatype, tuple(dims), element_description + else: + return datatype, None, element_description.split(",") + + +def expand_vars(expr: str, substitute: dict[str, str] = None) -> str: + """Expand (environment) variables. + + Note + ---- + Malformed variable names and references to non-existing variables are left + unchanged. + + Parameters + ---------- + expr + string expression, which may include (environment) variables prefixed by + ``$``. + substitute + use this dictionary to substitute variables. Takes precedence over + environment variables. + """ + if substitute is None: + substitute = {} + + # use provided mapping + # then expand env variables + return os.path.expandvars(string.Template(expr).safe_substitute(substitute)) + + +def expand_path( + path: str, + substitute: dict[str, str] = None, + list: bool = False, + base_path: str = None, +) -> str | list: + """Expand (environment) variables and wildcards to return absolute paths. + + Parameters + ---------- + path + name of path, which may include environment variables and wildcards. + list + if ``True``, return a list. If ``False``, return a string; if ``False`` + and a unique file is not found, raise an exception. + substitute + use this dictionary to substitute variables. Environment variables take + precedence. + base_path + name of base path. Returned paths will be relative to base. 
+ + Returns + ------- + path or list of paths + Unique absolute path, or list of all absolute paths + """ + if base_path is not None and base_path != "": + base_path = os.path.expanduser(os.path.expandvars(base_path)) + path = os.path.join(base_path, path) + + # first expand variables + _path = expand_vars(path, substitute) + + # then expand wildcards + paths = sorted(glob.glob(os.path.expanduser(_path))) + + if base_path is not None and base_path != "": + paths = [os.path.relpath(p, base_path) for p in paths] + + if not list: + if len(paths) == 0: + raise FileNotFoundError(f"could not find path matching {path}") + elif len(paths) > 1: + raise FileNotFoundError(f"found multiple paths matching {path}") + else: + return paths[0] + else: + return paths diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py index 7103d05c..ce8b72cd 100644 --- a/src/lgdo/lh5_store.py +++ b/src/lgdo/lh5_store.py @@ -1,166 +1,91 @@ -""" -This module implements routines from reading and writing LEGEND Data Objects in -HDF5 files. -""" from __future__ import annotations -import fnmatch -import glob -import logging -import os import sys -from bisect import bisect_left -from collections import defaultdict -from typing import Any, Iterator, Union +from typing import Iterator, Union +from warnings import warn import h5py -import numba as nb import numpy as np import pandas as pd -from . import compression as compress -from .compression import WaveformCodec -from .lgdo_utils import expand_path, parse_datatype -from .types import ( - Array, - ArrayOfEncodedEqualSizedArrays, - ArrayOfEqualSizedArrays, - FixedSizeArray, - Scalar, - Struct, - Table, - VectorOfEncodedVectors, - VectorOfVectors, - WaveformTable, -) - +from . import lh5 +from .types import Array # noqa: F401 +from .types import ArrayOfEncodedEqualSizedArrays # noqa: F401 +from .types import ArrayOfEqualSizedArrays # noqa: F401 +from .types import FixedSizeArray # noqa: F401 +from .types import Scalar # noqa: F401 +from .types import Struct # noqa: F401 +from .types import Table # noqa: F401 +from .types import VectorOfEncodedVectors # noqa: F401 +from .types import VectorOfVectors # noqa: F401 +from .types import WaveformTable # noqa: F401 + +DEFAULT_HDF5_COMPRESSION = None LGDO = Union[Array, Scalar, Struct, VectorOfVectors] - -log = logging.getLogger(__name__) - DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"} -class LH5Store: - """ - Class to represent a store of LEGEND HDF5 files. The two main methods - implemented by the class are :meth:`read_object` and :meth:`write_object`. - - Examples - -------- - >>> from lgdo import LH5Store - >>> store = LH5Store() - >>> obj, _ = store.read_object("/geds/waveform", "file.lh5") - >>> type(obj) - lgdo.waveform_table.WaveformTable - """ - - def __init__(self, base_path: str = "", keep_open: bool = False) -> None: - """ - Parameters - ---------- - base_path - directory path to prepend to LH5 files. - keep_open - whether to keep files open by storing the :mod:`h5py` objects as - class attributes. - """ - self.base_path = "" if base_path == "" else expand_path(base_path) - self.keep_open = keep_open - self.files = {} - - def gimme_file(self, lh5_file: str | h5py.File, mode: str = "r") -> h5py.File: - """Returns a :mod:`h5py` file object from the store or creates a new one. - - Parameters - ---------- - lh5_file - LH5 file name. - mode - mode in which to open file. See :class:`h5py.File` documentation. 
- """ - if isinstance(lh5_file, h5py.File): - return lh5_file - if mode == "r": - lh5_file = expand_path(lh5_file, base_path=self.base_path) - if lh5_file in self.files.keys(): - return self.files[lh5_file] - if self.base_path != "": - full_path = os.path.join(self.base_path, lh5_file) - else: - full_path = lh5_file - if mode != "r": - directory = os.path.dirname(full_path) - if directory != "" and not os.path.exists(directory): - log.debug(f"making path {directory}") - os.makedirs(directory) - if mode == "r" and not os.path.exists(full_path): - raise FileNotFoundError(f"file {full_path} not found") - if mode != "r" and os.path.exists(full_path): - log.debug(f"opening existing file {full_path} in mode '{mode}'") - h5f = h5py.File(full_path, mode) - if self.keep_open: - self.files[lh5_file] = h5f - return h5f - - def gimme_group( +class LH5Iterator(lh5.LH5Iterator): + def __init__( self, - group: str | h5py.Group, - base_group: h5py.Group, - grp_attrs: dict[str, Any] = None, - overwrite: bool = False, - ) -> h5py.Group: - """ - Returns an existing :class:`h5py` group from a base group or creates a - new one. Can also set (or replace) group attributes. - - Parameters - ---------- - group - name of the HDF5 group. - base_group - HDF5 group to be used as a base. - grp_attrs - HDF5 group attributes. - overwrite - whether overwrite group attributes, ignored if `grp_attrs` is - ``None``. - """ - if not isinstance(group, h5py.Group): - if group in base_group: - group = base_group[group] - else: - group = base_group.create_group(group) - if grp_attrs is not None: - group.attrs.update(grp_attrs) - return group - if ( - grp_attrs is not None - and len(set(grp_attrs.items()) ^ set(group.attrs.items())) > 0 - ): - if not overwrite: - raise RuntimeError("grp_attrs != group.attrs but overwrite not set") - else: - log.debug(f"overwriting {group}.attrs...") - for key in group.attrs.keys(): - group.attrs.pop(key) - group.attrs.update(grp_attrs) - return group + lh5_files: str | list[str], + groups: str | list[str], + base_path: str = "", + entry_list: list[int] | list[list[int]] = None, + entry_mask: list[bool] | list[list[bool]] = None, + field_mask: dict[str, bool] | list[str] | tuple[str] = None, + buffer_len: int = 3200, + friend: Iterator = None, + ) -> None: + warn( + "lgdo.lh5_store has moved to a subfolder lgdo.lh5 containing LH5Iterator." + "Please replace 'from lgdo.lh5_store import LH5Iterator' with 'from lgdo.lh5 import LH5Iterator'." + "lgdo.lh5_store will be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__( + lh5_files, + groups, + base_path, + entry_list, + entry_mask, + field_mask, + buffer_len, + friend, + ) - def get_buffer( + def write_object( self, + obj: LGDO, name: str, - lh5_file: str | h5py.File | list[str | h5py.File], - size: int = None, - field_mask: dict[str, bool] | list[str] | tuple[str] = None, - ) -> LGDO: - """Returns an LH5 object appropriate for use as a pre-allocated buffer - in a read loop. Sets size to `size` if object has a size. - """ - obj, n_rows = self.read_object(name, lh5_file, n_rows=0, field_mask=field_mask) - if hasattr(obj, "resize") and size is not None: - obj.resize(new_size=size) - return obj + lh5_file: str | h5py.File, + group: str | h5py.Group = "/", + start_row: int = 0, + n_rows: int = None, + wo_mode: str = "append", + write_start: int = 0, + **h5py_kwargs, + ) -> None: + warn( + "lgdo.lh5_store has moved to a subfolder lgdo.lh5 containing LH5Iterator. 
" + "The object you are calling this function from uses the old LH5Iterator class." + "Please replace 'from lgdo.lh5_store import LH5Iterator' with 'from lgdo.lh5 import LH5Iterator'." + "lgdo.lh5_store will be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + self.write( + obj, + name, + lh5_file, + group, + start_row, + n_rows, + wo_mode, + write_start, + h5py_kwargs, + ) def read_object( self, @@ -169,1165 +94,85 @@ def read_object( start_row: int = 0, n_rows: int = sys.maxsize, idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None, - use_h5idx: bool = False, field_mask: dict[str, bool] | list[str] | tuple[str] = None, obj_buf: LGDO = None, obj_buf_start: int = 0, decompress: bool = True, ) -> tuple[LGDO, int]: - """Read LH5 object data from a file. - - Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag - controls whether *only* those rows are read from disk or if the rows are indexed after reading - the entire object. Reading individual rows can be orders of magnitude slower than reading - the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``) - is to use slightly more memory for a much faster read. See - `legend-pydataobj #29 `_ - for additional information. - - Parameters - ---------- - name - Name of the LH5 object to be read (including its group path). - lh5_file - The file(s) containing the object to be read out. If a list of - files, array-like object data will be concatenated into the output - object. - start_row - Starting entry for the object read (for array-like objects). For a - list of files, only applies to the first file. - n_rows - The maximum number of rows to read (for array-like objects). The - actual number of rows read will be returned as one of the return - values (see below). - idx - For NumPy-style "fancying indexing" for the read to select only some - rows, e.g. after applying some cuts to particular columns. - Only selection along the first axis is supported, so tuple arguments - must be one-tuples. If `n_rows` is not false, `idx` will be truncated to - `n_rows` before reading. To use with a list of files, can pass in a list of - `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous - identical read). If used in conjunction with `start_row` and `n_rows`, - will be sliced to obey those constraints, where `n_rows` is - interpreted as the (max) number of *selected* values (in `idx`) to be - read out. Note that the ``use_h5idx`` parameter controls some behaviour of the - read and that the default behavior (``use_h5idx=False``) prioritizes speed over - a small memory penalty. - use_h5idx - ``True`` will directly pass the ``idx`` parameter to the underlying - ``h5py`` call such that only the selected rows are read directly into memory, - which conserves memory at the cost of speed. There can be a significant penalty - to speed for larger files (1 - 2 orders of magnitude longer time). - ``False`` (default) will read the entire object into memory before - performing the indexing. The default is much faster but requires additional memory, - though a relatively small amount in the typical use case. It is recommended to - leave this parameter as its default. - field_mask - For tables and structs, determines which fields get written out. - Only applies to immediate fields of the requested objects. If a dict - is used, a default dict will be made with the default set to the - opposite of the first element in the dict. 
This way if one specifies - a few fields at ``False``, all but those fields will be read out, - while if one specifies just a few fields as ``True``, only those - fields will be read out. If a list is provided, the listed fields - will be set to ``True``, while the rest will default to ``False``. - obj_buf - Read directly into memory provided in `obj_buf`. Note: the buffer - will be expanded to accommodate the data requested. To maintain the - buffer length, send in ``n_rows = len(obj_buf)``. - obj_buf_start - Start location in ``obj_buf`` for read. For concatenating data to - array-like objects. - decompress - Decompress data encoded with LGDO's compression routines right - after reading. The option has no effect on data encoded with HDF5 - built-in filters, which is always decompressed upstream by HDF5. - - - Returns - ------- - (object, n_rows_read) - `object` is the read-out object `n_rows_read` is the number of rows - successfully read out. Essential for arrays when the amount of data - is smaller than the object buffer. For scalars and structs - `n_rows_read` will be``1``. For tables it is redundant with - ``table.loc``. - """ - # Handle list-of-files recursively - if not isinstance(lh5_file, (str, h5py.File)): - lh5_file = list(lh5_file) - n_rows_read = 0 - - # to know whether we are reading in a list of files. - # this is part of the fix for reading data by idx - # (see https://github.com/legend-exp/legend-pydataobj/issues/29) - # so that we only make a copy of the data if absolutely necessary - # or if we can read the data from file without having to make a copy - self.in_file_loop = True - - for i, h5f in enumerate(lh5_file): - if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]): - # a list of lists: must be one per file - idx_i = idx[i] - elif idx is not None: - # make idx a proper tuple if it's not one already - if not (isinstance(idx, tuple) and len(idx) == 1): - idx = (idx,) - # idx is a long continuous array - n_rows_i = self.read_n_rows(name, h5f) - # find the length of the subset of idx that contains indices - # that are less than n_rows_i - n_rows_to_read_i = bisect_left(idx[0], n_rows_i) - # now split idx into idx_i and the remainder - idx_i = (idx[0][:n_rows_to_read_i],) - idx = (idx[0][n_rows_to_read_i:] - n_rows_i,) - else: - idx_i = None - n_rows_i = n_rows - n_rows_read - - # maybe someone passed in a list of len==1? - if i == (len(lh5_file) - 1): - self.in_file_loop = False - - obj_buf, n_rows_read_i = self.read_object( - name, - lh5_file[i], - start_row=start_row, - n_rows=n_rows_i, - idx=idx_i, - use_h5idx=use_h5idx, - field_mask=field_mask, - obj_buf=obj_buf, - obj_buf_start=obj_buf_start, - decompress=decompress, - ) - - n_rows_read += n_rows_read_i - if n_rows_read >= n_rows or obj_buf is None: - return obj_buf, n_rows_read - start_row = 0 - obj_buf_start += n_rows_read_i - - self.in_file_loop = False - - return obj_buf, n_rows_read - - # get the file from the store - h5f = self.gimme_file(lh5_file, "r") - if not h5f or name not in h5f: - raise KeyError(f"'{name}' not in {h5f.filename}") - - log.debug( - f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, " - + (f" with field mask {field_mask}" if field_mask else "") + warn( + "lgdo.lh5_store has moved to a subfolder lgdo.lh5 containing LH5Iterator. " + "The object you are calling this function from uses the old LH5Iterator class." + "Please replace 'from lgdo.lh5_store import LH5Iterator' with 'from lgdo.lh5 import LH5Iterator'." 
+ "lgdo.lh5_store will be removed in a future release.", + DeprecationWarning, + stacklevel=2, ) - - # make idx a proper tuple if it's not one already - if not (isinstance(idx, tuple) and len(idx) == 1): - if idx is not None: - idx = (idx,) - - # get the object's datatype - if "datatype" not in h5f[name].attrs: - raise RuntimeError( - f"'{name}' in file {lh5_file} is missing the datatype attribute" - ) - - datatype = h5f[name].attrs["datatype"] - datatype, shape, elements = parse_datatype(datatype) - - # check field_mask and make it a default dict - if datatype == "struct" or datatype == "table": - if field_mask is None: - field_mask = defaultdict(lambda: True) - elif isinstance(field_mask, dict): - default = True - if len(field_mask) > 0: - default = not field_mask[list(field_mask.keys())[0]] - field_mask = defaultdict(lambda: default, field_mask) - elif isinstance(field_mask, (list, tuple)): - field_mask = defaultdict( - lambda: False, {field: True for field in field_mask} - ) - elif not isinstance(field_mask, defaultdict): - raise RuntimeError("bad field_mask of type", type(field_mask).__name__) - elif field_mask is not None: - raise RuntimeError(f"datatype {datatype} does not accept a field_mask") - - # Scalar - # scalars are dim-0 datasets - if datatype == "scalar": - value = h5f[name][()] - if elements == "bool": - value = np.bool_(value) - if obj_buf is not None: - obj_buf.value = value - obj_buf.attrs.update(h5f[name].attrs) - return obj_buf, 1 - else: - return Scalar(value=value, attrs=h5f[name].attrs), 1 - - # Struct - # recursively build a struct, return as a dictionary - if datatype == "struct": - # ignore obj_buf. - # TODO: could append new fields or overwrite/concat to existing - # fields. If implemented, get_buffer() above should probably also - # (optionally?) prep buffers for each field - if obj_buf is not None: - raise NotImplementedError("obj_buf not implemented for LGOD Structs") - - # loop over fields and read - obj_dict = {} - for field in elements: - if not field_mask[field]: - continue - # TODO: it's strange to pass start_row, n_rows, idx to struct - # fields. If they all had shared indexing, they should be in a - # table... Maybe should emit a warning? Or allow them to be - # dicts keyed by field name? - if "int_keys" in h5f[name].attrs: - if dict(h5f[name].attrs)["int_keys"]: - f = int(field) - else: - f = str(field) - obj_dict[f], _ = self.read_object( - name + "/" + field, - h5f, - start_row=start_row, - n_rows=n_rows, - idx=idx, - use_h5idx=use_h5idx, - decompress=decompress, - ) - # modify datatype in attrs if a field_mask was used - attrs = dict(h5f[name].attrs) - if field_mask is not None: - selected_fields = [] - for field in elements: - if field_mask[field]: - selected_fields.append(field) - attrs["datatype"] = "struct" + "{" + ",".join(selected_fields) + "}" - return Struct(obj_dict=obj_dict, attrs=attrs), 1 - - # Below here is all array-like types. 
So trim idx if needed - if idx is not None: - # chop off indices < start_row - i_first_valid = bisect_left(idx[0], start_row) - idxa = idx[0][i_first_valid:] - # don't readout more than n_rows indices - idx = (idxa[:n_rows],) # works even if n_rows > len(idxa) - - # Table or WaveformTable - if datatype == "table": - col_dict = {} - - # read out each of the fields - rows_read = [] - for field in elements: - if not field_mask[field]: - continue - - fld_buf = None - if obj_buf is not None: - if not isinstance(obj_buf, Table) or field not in obj_buf: - raise ValueError( - f"obj_buf for LGDO Table '{name}' not formatted correctly" - ) - - else: - fld_buf = obj_buf[field] - - col_dict[field], n_rows_read = self.read_object( - name + "/" + field, - h5f, - start_row=start_row, - n_rows=n_rows, - idx=idx, - use_h5idx=use_h5idx, - obj_buf=fld_buf, - obj_buf_start=obj_buf_start, - decompress=decompress, - ) - if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf): - obj_buf.resize(obj_buf_start + n_rows_read) - - rows_read.append(n_rows_read) - - # warn if all columns don't read in the same number of rows - if len(rows_read) > 0: - n_rows_read = rows_read[0] - else: - n_rows_read = 0 - log.warning(f"Table '{name}' has no subgroups accepted by field mask") - - for n in rows_read[1:]: - if n != n_rows_read: - log.warning( - f"Table '{name}' got strange n_rows_read = {n}, {n_rows_read} was expected ({rows_read})" - ) - - # modify datatype in attrs if a field_mask was used - attrs = dict(h5f[name].attrs) - if field_mask is not None: - selected_fields = [] - for field in elements: - if field_mask[field]: - selected_fields.append(field) - attrs["datatype"] = "table" + "{" + ",".join(selected_fields) + "}" - - # fields have been read out, now return a table - if obj_buf is None: - # if col_dict contains just 3 objects called t0, dt, and values, - # return a WaveformTable - if ( - len(col_dict) == 3 - and "t0" in col_dict - and "dt" in col_dict - and "values" in col_dict - ): - table = WaveformTable( - t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"] - ) - else: - table = Table(col_dict=col_dict, attrs=attrs) - - # set (write) loc to end of tree - table.loc = n_rows_read - return table, n_rows_read - else: - # We have read all fields into the object buffer. Run - # checks: All columns should be the same size. So update - # table's size as necessary, warn if any mismatches are found - obj_buf.resize(do_warn=True) - # set (write) loc to end of tree - obj_buf.loc = obj_buf_start + n_rows_read - # check attributes - if set(obj_buf.attrs.keys()) != set(attrs.keys()): - raise RuntimeError( - f"attrs mismatch. 
obj_buf.attrs: " - f"{obj_buf.attrs}, h5f[{name}].attrs: {attrs}" - ) - return obj_buf, n_rows_read - - # ArrayOfEncodedEqualSizedArrays and VectorOfEncodedVectors - for cond, enc_lgdo in [ - ( - datatype == "array_of_encoded_equalsized_arrays", - ArrayOfEncodedEqualSizedArrays, - ), - (elements.startswith("encoded_array"), VectorOfEncodedVectors), - ]: - if cond: - if ( - not decompress - and obj_buf is not None - and not isinstance(obj_buf, enc_lgdo) - ): - raise ValueError(f"obj_buf for '{name}' not a {enc_lgdo}") - - # read out decoded_size, either a Scalar or an Array - decoded_size_buf = encoded_data_buf = None - if obj_buf is not None and not decompress: - decoded_size_buf = obj_buf.decoded_size - encoded_data_buf = obj_buf.encoded_data - - decoded_size, _ = self.read_object( - f"{name}/decoded_size", - h5f, - start_row=start_row, - n_rows=n_rows, - idx=idx, - use_h5idx=use_h5idx, - obj_buf=None if decompress else decoded_size_buf, - obj_buf_start=0 if decompress else obj_buf_start, - ) - - # read out encoded_data, a VectorOfVectors - encoded_data, n_rows_read = self.read_object( - f"{name}/encoded_data", - h5f, - start_row=start_row, - n_rows=n_rows, - idx=idx, - use_h5idx=use_h5idx, - obj_buf=None if decompress else encoded_data_buf, - obj_buf_start=0 if decompress else obj_buf_start, - ) - - # return the still encoded data in the buffer object, if there - if obj_buf is not None and not decompress: - return obj_buf, n_rows_read - - # otherwise re-create the encoded LGDO - rawdata = enc_lgdo( - encoded_data=encoded_data, - decoded_size=decoded_size, - attrs=h5f[name].attrs, - ) - - # already return if no decompression is requested - if not decompress: - return rawdata, n_rows_read - - # if no buffer, decode and return - elif obj_buf is None and decompress: - return compress.decode(rawdata), n_rows_read - - # eventually expand provided obj_buf, if too short - buf_size = obj_buf_start + n_rows_read - if len(obj_buf) < buf_size: - obj_buf.resize(buf_size) - - # use the (decoded object type) buffer otherwise - if enc_lgdo == ArrayOfEncodedEqualSizedArrays: - if not isinstance(obj_buf, ArrayOfEqualSizedArrays): - raise ValueError( - f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays" - ) - - compress.decode(rawdata, obj_buf[obj_buf_start:buf_size]) - - elif enc_lgdo == VectorOfEncodedVectors: - if not isinstance(obj_buf, VectorOfVectors): - raise ValueError( - f"obj_buf for decoded '{name}' not a VectorOfVectors" - ) - - # FIXME: not a good idea. 
an in place decoding version - # of decode would be needed to avoid extra memory - # allocations - for i, wf in enumerate(compress.decode(rawdata)): - obj_buf[obj_buf_start + i] = wf - - return obj_buf, n_rows_read - - # VectorOfVectors - # read out vector of vectors of different size - if elements.startswith("array"): - if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors): - raise ValueError(f"obj_buf for '{name}' not a LGDO VectorOfVectors") - - # read out cumulative_length - cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length - cumulative_length, n_rows_read = self.read_object( - f"{name}/cumulative_length", - h5f, - start_row=start_row, - n_rows=n_rows, - idx=idx, - use_h5idx=use_h5idx, - obj_buf=cumulen_buf, - obj_buf_start=obj_buf_start, - ) - # get a view of just what was read out for cleaner code below - this_cumulen_nda = cumulative_length.nda[ - obj_buf_start : obj_buf_start + n_rows_read - ] - - if idx is not None and n_rows_read > 0: - # get the starting indices for each array in flattended data: - # the starting index for array[i] is cumulative_length[i-1] - idx2 = (np.asarray(idx[0]).copy() - 1,) - # re-read cumulative_length with these indices - # note this will allocate memory for fd_starts! - fd_start = None - if idx2[0][0] == -1: - idx2 = (idx2[0][1:],) - fd_start = 0 # this variable avoids an ndarray append - fd_starts, fds_n_rows_read = self.read_object( - f"{name}/cumulative_length", - h5f, - start_row=start_row, - n_rows=n_rows, - idx=idx2, - use_h5idx=use_h5idx, - ) - fd_starts = fd_starts.nda # we just need the nda - if fd_start is None: - fd_start = fd_starts[0] - - # compute the length that flattened_data will have after the - # fancy-indexed read - fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts) - if fd_start == 0: - fd_n_rows += this_cumulen_nda[0] - - # now make fd_idx - fd_idx = np.empty(fd_n_rows, dtype="uint32") - fd_idx = _make_fd_idx(fd_starts, this_cumulen_nda, fd_idx) - - # Now clean up this_cumulen_nda, to be ready - # to match the in-memory version of flattened_data. Note: these - # operations on the view change the original array because they are - # numpy arrays, not lists. - this_cumulen_nda[-len(fd_starts) :] -= fd_starts - np.cumsum(this_cumulen_nda, out=this_cumulen_nda) - - else: - fd_idx = None - - # determine the start_row and n_rows for the flattened_data readout - fd_start = 0 - if start_row > 0 and n_rows_read > 0: - # need to read out the cumulen sample -before- the first sample - # read above in order to get the starting row of the first - # vector to read out in flattened_data - fd_start = h5f[f"{name}/cumulative_length"][start_row - 1] - - # check limits for values that will be used subsequently - if this_cumulen_nda[-1] < fd_start: - log.debug( - f"this_cumulen_nda[-1] = {this_cumulen_nda[-1]}, " - f"fd_start = {fd_start}, " - f"start_row = {start_row}, " - f"n_rows_read = {n_rows_read}" - ) - raise RuntimeError( - f"cumulative_length non-increasing between entries " - f"{start_row} and {start_row+n_rows_read} ??" - ) - - # determine the number of rows for the flattened_data readout - fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0 - - # Now done with this_cumulen_nda, so we can clean it up to be ready - # to match the in-memory version of flattened_data. Note: these - # operations on the view change the original array because they are - # numpy arrays, not lists. 
- # - # First we need to subtract off the in-file offset for the start of - # read for flattened_data - this_cumulen_nda -= fd_start - - # If we started with a partially-filled buffer, add the - # appropriate offset for the start of the in-memory flattened - # data for this read. - fd_buf_start = np.uint32(0) - if obj_buf_start > 0: - fd_buf_start = cumulative_length.nda[obj_buf_start - 1] - this_cumulen_nda += fd_buf_start - - # Now prepare the object buffer if necessary - fd_buf = None - if obj_buf is not None: - fd_buf = obj_buf.flattened_data - # grow fd_buf if necessary to hold the data - fdb_size = fd_buf_start + fd_n_rows - if len(fd_buf) < fdb_size: - fd_buf.resize(fdb_size) - - # now read - flattened_data, dummy_rows_read = self.read_object( - f"{name}/flattened_data", - h5f, - start_row=fd_start, - n_rows=fd_n_rows, - idx=fd_idx, - use_h5idx=use_h5idx, - obj_buf=fd_buf, - obj_buf_start=fd_buf_start, - ) - if obj_buf is not None: - return obj_buf, n_rows_read - return ( - VectorOfVectors( - flattened_data=flattened_data, - cumulative_length=cumulative_length, - attrs=h5f[name].attrs, - ), - n_rows_read, - ) - - # Array - # FixedSizeArray - # ArrayOfEqualSizedArrays - # read out all arrays by slicing - if "array" in datatype: - if obj_buf is not None: - if not isinstance(obj_buf, Array): - raise ValueError(f"obj_buf for '{name}' not an LGDO Array") - obj_buf = None - - # compute the number of rows to read - # we culled idx above for start_row and n_rows, now we have to apply - # the constraint of the length of the dataset - ds_n_rows = h5f[name].shape[0] - if idx is not None: - if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows: - log.warning( - "idx indexed past the end of the array in the file. Culling..." - ) - n_rows_to_read = bisect_left(idx[0], ds_n_rows) - idx = (idx[0][:n_rows_to_read],) - if len(idx[0]) == 0: - log.warning("idx empty after culling.") - n_rows_to_read = len(idx[0]) - else: - n_rows_to_read = ds_n_rows - start_row - if n_rows_to_read > n_rows: - n_rows_to_read = n_rows - - # if idx is passed, check if we can make it a slice instead (faster) - change_idx_to_slice = False - - # prepare the selection for the read. Use idx if available - if idx is not None: - # check if idx is empty and convert to slice instead - if len(idx[0]) == 0: - source_sel = np.s_[0:0] - change_idx_to_slice = True - # check if idx is contiguous and increasing - # if so, convert it to a slice instead (faster) - elif np.all(np.diff(idx[0]) == 1): - source_sel = np.s_[idx[0][0] : idx[0][-1] + 1] - change_idx_to_slice = True - else: - source_sel = idx - else: - source_sel = np.s_[start_row : start_row + n_rows_to_read] - - # Now read the array - if obj_buf is not None and n_rows_to_read > 0: - buf_size = obj_buf_start + n_rows_to_read - if len(obj_buf) < buf_size: - obj_buf.resize(buf_size) - dest_sel = np.s_[obj_buf_start:buf_size] - - # this is required to make the read of multiple files faster - # until a better solution found. 
- if change_idx_to_slice or idx is None or use_h5idx: - h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel) - else: - # it is faster to read the whole object and then do fancy indexing - obj_buf.nda[dest_sel] = h5f[name][...][source_sel] - - nda = obj_buf.nda - else: - if n_rows == 0: - tmp_shape = (0,) + h5f[name].shape[1:] - nda = np.empty(tmp_shape, h5f[name].dtype) - else: - if change_idx_to_slice or idx is None or use_h5idx: - nda = h5f[name][source_sel] - else: - # it is faster to read the whole object and then do fancy indexing - nda = h5f[name][...][source_sel] - - # if reading a list of files recursively, this is given to obj_buf on - # the first file read. obj_buf needs to be resized and therefore - # it needs to hold the data itself (not a view of the data). - # a view is returned by the source_sel indexing, which cannot be resized - # by ndarray.resize(). - if hasattr(self, "in_file_loop") and self.in_file_loop: - nda = np.copy(nda) - - # special handling for bools - # (c and Julia store as uint8 so cast to bool) - if elements == "bool": - nda = nda.astype(np.bool_) - - # Finally, set attributes and return objects - attrs = h5f[name].attrs - if obj_buf is None: - if datatype == "array": - return Array(nda=nda, attrs=attrs), n_rows_to_read - if datatype == "fixedsize_array": - return FixedSizeArray(nda=nda, attrs=attrs), n_rows_to_read - if datatype == "array_of_equalsized_arrays": - return ( - ArrayOfEqualSizedArrays(nda=nda, dims=shape, attrs=attrs), - n_rows_to_read, - ) - else: - if set(obj_buf.attrs.keys()) != set(attrs.keys()): - raise RuntimeError( - f"attrs mismatch. " - f"obj_buf.attrs: {obj_buf.attrs}, " - f"h5f[{name}].attrs: {attrs}" - ) - return obj_buf, n_rows_to_read - - raise RuntimeError("don't know how to read datatype {datatype}") - - def write_object( - self, - obj: LGDO, - name: str, - lh5_file: str | h5py.File, - group: str | h5py.Group = "/", - start_row: int = 0, - n_rows: int = None, - wo_mode: str = "append", - write_start: int = 0, - **h5py_kwargs, - ) -> None: - """Write an LGDO into an LH5 file. - - If the `obj` :class:`.LGDO` has a `compression` attribute, its value is - interpreted as the algorithm to be used to compress `obj` before - writing to disk. The type of `compression` can be: - - string, kwargs dictionary, hdf5plugin filter - interpreted as the name of a built-in or custom `HDF5 compression - filter `_ - (``"gzip"``, ``"lzf"``, :mod:`hdf5plugin` filter object etc.) and - passed directly to :meth:`h5py.Group.create_dataset`. - - :class:`.WaveformCodec` object - If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds the - attribute, compress ``values`` using this algorithm. More - documentation about the supported waveform compression algorithms at - :mod:`.lgdo.compression`. - - If the `obj` :class:`.LGDO` has a `hdf5_settings` attribute holding a - dictionary, it is interpreted as a list of keyword arguments to be - forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like - the first format of `compression` above). This is the preferred way to - specify HDF5 dataset options such as chunking etc. If compression - options are specified, they take precedence over those set with the - `compression` attribute. - - Note - ---- - The `compression` LGDO attribute takes precedence over the default HDF5 - compression settings. The `hdf5_settings` attribute takes precedence - over `compression`. These attributes are not written to disk. 
- - Note - ---- - HDF5 compression is skipped for the `encoded_data.flattened_data` - dataset of :class:`.VectorOfEncodedVectors` and - :class:`.ArrayOfEncodedEqualSizedArrays`. - - Parameters - ---------- - obj - LH5 object. if object is array-like, writes `n_rows` starting from - `start_row` in `obj`. - name - name of the object in the output HDF5 file. - lh5_file - HDF5 file name or :class:`h5py.File` object. - group - HDF5 group name or :class:`h5py.Group` object in which `obj` should - be written. - start_row - first row in `obj` to be written. - n_rows - number of rows in `obj` to be written. - wo_mode - - ``write_safe`` or ``w``: only proceed with writing if the - object does not already exist in the file. - - ``append`` or ``a``: append along axis 0 (the first dimension) - of array-like objects and array-like subfields of structs. - :class:`~.lgdo.scalar.Scalar` objects get overwritten. - - ``overwrite`` or ``o``: replace data in the file if present, - starting from `write_start`. Note: overwriting with `write_start` = - end of array is the same as ``append``. - - ``overwrite_file`` or ``of``: delete file if present prior to - writing to it. `write_start` should be 0 (its ignored). - - ``append_column`` or ``ac``: append columns from an :class:`~.lgdo.table.Table` - `obj` only if there is an existing :class:`~.lgdo.table.Table` in the `lh5_file` with - the same `name` and :class:`~.lgdo.table.Table.size`. If the sizes don't match, - or if there are matching fields, it errors out. - write_start - row in the output file (if already existing) to start overwriting - from. - **h5py_kwargs - additional keyword arguments forwarded to - :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5 - compression filter to be applied before writing non-scalar - datasets. **Note: `compression` Ignored if compression is specified - as an `obj` attribute.** - """ - log.debug( - f"writing {repr(obj)}[{start_row}:{n_rows}] as " - f"{lh5_file}:{group}/{name}[{write_start}:], " - f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}" + return self.read( + name, + lh5_file, + start_row, + n_rows, + idx, + field_mask, + obj_buf, + obj_buf_start, + decompress, ) - if wo_mode == "write_safe": - wo_mode = "w" - if wo_mode == "append": - wo_mode = "a" - if wo_mode == "overwrite": - wo_mode = "o" - if wo_mode == "overwrite_file": - wo_mode = "of" - write_start = 0 - if wo_mode == "append_column": - wo_mode = "ac" - if wo_mode not in ["w", "a", "o", "of", "ac"]: - raise ValueError(f"unknown wo_mode '{wo_mode}'") - - # "mode" is for the h5df.File and wo_mode is for this function - # In hdf5, 'a' is really "modify" -- in addition to appending, you can - # change any object in the file. So we use file:append for - # write_object:overwrite. - mode = "w" if wo_mode == "of" else "a" - lh5_file = self.gimme_file(lh5_file, mode=mode) - group = self.gimme_group(group, lh5_file) - if wo_mode == "w" and name in group: - raise RuntimeError(f"can't overwrite '{name}' in wo_mode 'write_safe'") - - # struct or table or waveform table - if isinstance(obj, Struct): - # In order to append a column, we need to update the `table{old_fields}` value in `group.attrs['datatype"]` to include the new fields. - # One way to do this is to override `obj.attrs["datatype"]` to include old and new fields. Then we can write the fields to the table as normal. 
- if wo_mode == "ac": - old_group = self.gimme_group(name, group) - datatype, shape, fields = parse_datatype(old_group.attrs["datatype"]) - if datatype not in ["table", "struct"]: - raise RuntimeError( - f"Trying to append columns to an object of type {datatype}" - ) - - # If the mode is `append_column`, make sure we aren't appending a table that has a column of the same name as in the existing table - # Also make sure that the field we are adding has the same size - if len(list(set(fields).intersection(set(obj.keys())))) != 0: - raise ValueError( - f"Can't append {list(set(fields).intersection(set(obj.keys())))} column(s) to a table with the same field(s)" - ) - # It doesn't matter what key we access, as all fields in the old table have the same size - if old_group[list(old_group.keys())[0]].size != obj.size: - raise ValueError( - f"Table sizes don't match. Trying to append column of size {obj.size} to a table of size {old_group[list(old_group.keys())[0]].size}." - ) - - # Now we can append the obj.keys() to the old fields, and then update obj.attrs. - fields.extend(list(obj.keys())) - obj.attrs.pop("datatype") - obj.attrs["datatype"] = "table" + "{" + ",".join(fields) + "}" - - group = self.gimme_group( - name, - group, - grp_attrs=obj.attrs, - overwrite=(wo_mode in ["o", "ac"]), - ) - # If the mode is overwrite, then we need to peek into the file's table's existing fields - # If we are writing a new table to the group that does not contain an old field, we should delete that old field from the file - if wo_mode == "o": - # Find the old keys in the group that are not present in the new table's keys, then delete them - for key in list(set(group.keys()) - set(obj.keys())): - log.debug(f"{key} is not present in new table, deleting field") - del group[key] - - for field in obj.keys(): - # eventually compress waveform table values with LGDO's - # custom codecs before writing - # if waveformtable.values.attrs["compression"] is NOT a - # WaveformCodec, just leave it there - obj_fld = None - if ( - isinstance(obj, WaveformTable) - and field == "values" - and not isinstance(obj.values, VectorOfEncodedVectors) - and not isinstance(obj.values, ArrayOfEncodedEqualSizedArrays) - and "compression" in obj.values.attrs - and isinstance(obj.values.attrs["compression"], WaveformCodec) - ): - codec = obj.values.attrs["compression"] - obj_fld = compress.encode(obj.values, codec=codec) - else: - obj_fld = obj[field] - - # Convert keys to string for dataset names - f = str(field) - self.write_object( - obj_fld, - f, - lh5_file, - group=group, - start_row=start_row, - n_rows=n_rows, - wo_mode=wo_mode, - write_start=write_start, - **h5py_kwargs, - ) - return - - # scalars - elif isinstance(obj, Scalar): - if name in group: - if wo_mode in ["o", "a"]: - log.debug(f"overwriting {name} in {group}") - del group[name] - else: - raise RuntimeError( - f"tried to overwrite {name} in {group} for wo_mode {wo_mode}" - ) - ds = group.create_dataset(name, shape=(), data=obj.value) - ds.attrs.update(obj.attrs) - return - - # vector of encoded vectors - elif isinstance(obj, (VectorOfEncodedVectors, ArrayOfEncodedEqualSizedArrays)): - group = self.gimme_group( - name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o") - ) - - # ask not to further compress flattened_data, it is already compressed! 
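The attribute set just below tells the array-writing branch further down not to stack an HDF5 filter on top of the LGDO-encoded bytes. Roughly, as a sketch (with ``DEFAULT_HDF5_SETTINGS`` assumed to hold something like ``{"compression": "gzip"}``):

>>> h5py_kwargs = dict(DEFAULT_HDF5_SETTINGS)
>>> h5py_kwargs["compression"] = None  # the flattened_data attribute overrides the default
>>> # group.create_dataset(name, data=nda, **h5py_kwargs) then writes without compression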
- obj.encoded_data.flattened_data.attrs["compression"] = None - - self.write_object( - obj.encoded_data, - "encoded_data", - lh5_file, - group=group, - start_row=start_row, - n_rows=n_rows, - wo_mode=wo_mode, - write_start=write_start, - **h5py_kwargs, - ) - - self.write_object( - obj.decoded_size, - "decoded_size", - lh5_file, - group=group, - start_row=start_row, - n_rows=n_rows, - wo_mode=wo_mode, - write_start=write_start, - **h5py_kwargs, - ) - - # vector of vectors - elif isinstance(obj, VectorOfVectors): - group = self.gimme_group( - name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o") - ) - if ( - n_rows is None - or n_rows > obj.cumulative_length.nda.shape[0] - start_row - ): - n_rows = obj.cumulative_length.nda.shape[0] - start_row - # if appending we need to add an appropriate offset to the - # cumulative lengths as appropriate for the in-file object - offset = 0 # declare here because we have to subtract it off at the end - if (wo_mode == "a" or wo_mode == "o") and "cumulative_length" in group: - len_cl = len(group["cumulative_length"]) - if wo_mode == "a": - write_start = len_cl - if len_cl > 0: - offset = group["cumulative_length"][write_start - 1] - - # First write flattened_data array. Only write rows with data. - fd_start = 0 if start_row == 0 else obj.cumulative_length.nda[start_row - 1] - fd_n_rows = obj.cumulative_length.nda[start_row + n_rows - 1] - fd_start - self.write_object( - obj.flattened_data, - "flattened_data", - lh5_file, - group=group, - start_row=fd_start, - n_rows=fd_n_rows, - wo_mode=wo_mode, - write_start=offset, - **h5py_kwargs, - ) - - # now offset is used to give appropriate in-file values for - # cumulative_length. Need to adjust it for start_row - if start_row > 0: - offset -= obj.cumulative_length.nda[start_row - 1] - - # Add offset to obj.cumulative_length itself to avoid memory allocation. - # Then subtract it off after writing! (otherwise it will be changed - # upon return) - cl_dtype = obj.cumulative_length.nda.dtype.type - obj.cumulative_length.nda += cl_dtype(offset) - - self.write_object( - obj.cumulative_length, - "cumulative_length", - lh5_file, - group=group, - start_row=start_row, - n_rows=n_rows, - wo_mode=wo_mode, - write_start=write_start, - **h5py_kwargs, - ) - obj.cumulative_length.nda -= cl_dtype(offset) - - return - - # if we get this far, must be one of the Array types - elif isinstance(obj, Array): - if n_rows is None or n_rows > obj.nda.shape[0] - start_row: - n_rows = obj.nda.shape[0] - start_row - - nda = obj.nda[start_row : start_row + n_rows] - - # hack to store bools as uint8 for c / Julia compliance - if nda.dtype.name == "bool": - nda = nda.astype(np.uint8) - - # need to create dataset from ndarray the first time for speed - # creating an empty dataset and appending to that is super slow! - if (wo_mode != "a" and write_start == 0) or name not in group: - # this is needed in order to have a resizable (in the first - # axis) data set, i.e. rows can be appended later - # NOTE: this automatically turns chunking on! 
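For reference, the resizable-dataset pattern the note above alludes to, in bare h5py: passing ``maxshape`` makes the first axis extendable, and h5py enables chunking implicitly.

>>> import h5py, numpy as np
>>> with h5py.File("sketch.lh5", "w") as f:
...     ds = f.create_dataset("x", data=np.arange(5), maxshape=(None,))
...     ds.resize(8, axis=0)  # rows can be appended later without recreating the dataset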
- maxshape = (None,) + nda.shape[1:] - h5py_kwargs.setdefault("maxshape", maxshape) - - if wo_mode == "o" and name in group: - log.debug(f"overwriting {name} in {group}") - del group[name] - - # set default compression options - for k, v in DEFAULT_HDF5_SETTINGS.items(): - h5py_kwargs.setdefault(k, v) - - # compress using the 'compression' LGDO attribute, if available - if "compression" in obj.attrs: - comp_algo = obj.attrs["compression"] - if isinstance(comp_algo, dict): - h5py_kwargs |= obj.attrs["compression"] - else: - h5py_kwargs["compression"] = obj.attrs["compression"] - - # and even the 'hdf5_settings' one, preferred - if "hdf5_settings" in obj.attrs: - h5py_kwargs |= obj.attrs["hdf5_settings"] - - # create HDF5 dataset - ds = group.create_dataset(name, data=nda, **h5py_kwargs) - - # attach HDF5 dataset attributes, but not "compression"! - _attrs = obj.getattrs(datatype=True) - _attrs.pop("compression", None) - _attrs.pop("hdf5_settings", None) - ds.attrs.update(_attrs) - return - - # Now append or overwrite - ds = group[name] - if not isinstance(ds, h5py.Dataset): - raise RuntimeError( - f"existing HDF5 object '{name}' in group '{group}'" - " is not a dataset! Cannot overwrite or append" - ) - - old_len = ds.shape[0] - if wo_mode == "a": - write_start = old_len - add_len = write_start + nda.shape[0] - old_len - ds.resize(old_len + add_len, axis=0) - ds[write_start:] = nda - return - - else: - raise RuntimeError( - f"do not know how to write '{name}' of type '{type(obj).__name__}'" - ) - - def read_n_rows(self, name: str, lh5_file: str | h5py.File) -> int | None: - """Look up the number of rows in an Array-like object called `name` in - `lh5_file`. - - Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`.""" - # this is basically a stripped down version of read_object - h5f = self.gimme_file(lh5_file, "r") - if not h5f or name not in h5f: - raise KeyError(f"'{name}' not in {lh5_file}") - - # get the datatype - if "datatype" not in h5f[name].attrs: - raise RuntimeError( - f"'{name}' in file {lh5_file} is missing the datatype attribute" - ) - - datatype = h5f[name].attrs["datatype"] - datatype, shape, elements = parse_datatype(datatype) - - # scalars are dim-0 datasets - if datatype == "scalar": - return None - - # structs don't have rows - if datatype == "struct": - return None - - # tables should have elements with all the same length - if datatype == "table": - # read out each of the fields - rows_read = None - for field in elements: - n_rows_read = self.read_n_rows(name + "/" + field, h5f) - if not rows_read: - rows_read = n_rows_read - elif rows_read != n_rows_read: - log.warning( - f"'{field}' field in table '{name}' has {rows_read} rows, " - f"{n_rows_read} was expected" - ) - return rows_read +class LH5Store(lh5.LH5Store): + def __init__(self, base_path: str = "", keep_open: bool = False): + warn( + "lgdo.lh5_store has moved to a subfolder lgdo.lh5 containing LH5Store. " + "Please replace 'from lgdo.lh5_store import LH5Store' with 'from lgdo.lh5 import LH5Store'." 
+ "lgdo.lh5_store will be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(base_path, keep_open) - # length of vector of vectors is the length of its cumulative_length - if elements.startswith("array"): - return self.read_n_rows(f"{name}/cumulative_length", h5f) - # length of vector of encoded vectors is the length of its decoded_size - if ( - elements.startswith("encoded_array") - or datatype == "array_of_encoded_equalsized_arrays" - ): - return self.read_n_rows(f"{name}/encoded_data", h5f) +def load_dfs( + f_list: str | list[str], + par_list: list[str], + lh5_group: str = "", + idx_list: list[np.ndarray | list | tuple] = None, +) -> pd.DataFrame: + warn( + "lgdo.lh5_store has moved to a subfolder lgdo.lh5. " + "Please replace 'from lgdo.lh5_store import load_dfs' with 'from lgdo.lh5 import load_dfs'. " + "lgdo.lh5_store will be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + return lh5.load_dfs(f_list, par_list, lh5_group, idx_list) - # return array length (without reading the array!) - if "array" in datatype: - # compute the number of rows to read - return h5f[name].shape[0] - raise RuntimeError(f"don't know how to read datatype '{datatype}'") +def load_nda( + f_list: str | list[str], + par_list: list[str], + lh5_group: str = "", + idx_list: list[np.ndarray | list | tuple] = None, +) -> dict[str, np.ndarray]: + warn( + "lgdo.lh5_store has moved to a subfolder lgdo.lh5. " + "Please replace 'from lgdo.lh5_store import load_nda' with 'from lgdo.lh5 import load_nda'. " + "lgdo.lh5_store will be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + return lh5.load_nda(f_list, par_list, lh5_group, idx_list) def ls(lh5_file: str | h5py.Group, lh5_group: str = "") -> list[str]: - """Return a list of LH5 groups in the input file and group, similar - to ``ls`` or ``h5ls``. Supports wildcards in group names. - - - Parameters - ---------- - lh5_file - name of file. - lh5_group - group to search. add a ``/`` to the end of the group name if you want to - list all objects inside that group. - """ - - log.debug( - f"Listing objects in '{lh5_file}'" - + ("" if lh5_group == "" else f" (and group {lh5_group})") + warn( + "lgdo.lh5_store has moved to a subfolder lgdo.lh5. " + "Please replace 'from lgdo.lh5_store import ls' with 'from lgdo.lh5 import ls'. " + "lgdo.lh5_store will be removed in a future release.", + DeprecationWarning, + stacklevel=2, ) - - lh5_st = LH5Store() - # To use recursively, make lh5_file a h5group instead of a string - if isinstance(lh5_file, str): - lh5_file = lh5_st.gimme_file(lh5_file, "r") - if lh5_group.startswith("/"): - lh5_group = lh5_group[1:] - - if lh5_group == "": - lh5_group = "*" - - splitpath = lh5_group.split("/", 1) - matchingkeys = fnmatch.filter(lh5_file.keys(), splitpath[0]) - - if len(splitpath) == 1: - return matchingkeys - else: - ret = [] - for key in matchingkeys: - ret.extend([f"{key}/{path}" for path in ls(lh5_file[key], splitpath[1])]) - return ret + return lh5.ls(lh5_file, lh5_group) def show( @@ -1337,495 +182,11 @@ def show( indent: str = "", header: bool = True, ) -> None: - """Print a tree of LH5 file contents with LGDO datatype. - - Parameters - ---------- - lh5_file - the LH5 file. - lh5_group - print only contents of this HDF5 group. - attrs - print the HDF5 attributes too. - indent - indent the diagram with this string. - header - print `lh5_group` at the top of the diagram. 
- - Examples - -------- - >>> from lgdo import show - >>> show("file.lh5", "/geds/raw") - /geds/raw - ├── channel · array<1>{real} - ├── energy · array<1>{real} - ├── timestamp · array<1>{real} - ├── waveform · table{t0,dt,values} - │ ├── dt · array<1>{real} - │ ├── t0 · array<1>{real} - │ └── values · array_of_equalsized_arrays<1,1>{real} - └── wf_std · array<1>{real} - """ - # open file - if isinstance(lh5_file, str): - lh5_file = h5py.File(expand_path(lh5_file), "r") - - # go to group - if lh5_group != "/": - lh5_file = lh5_file[lh5_group] - - if header: - print(f"\033[1m{lh5_group}\033[0m") # noqa: T201 - - # get an iterator over the keys in the group - it = iter(lh5_file) - key = None - - # make sure there is actually something in this file/group - try: - key = next(it) # get first key - except StopIteration: - print(f"{indent}└── empty") # noqa: T201 - return - - # loop over keys - while True: - val = lh5_file[key] - # we want to print the LGDO datatype - dtype = val.attrs.get("datatype", default="no datatype") - if dtype == "no datatype" and isinstance(val, h5py.Group): - dtype = "HDF5 group" - - _attrs = "" - if attrs: - attrs_d = dict(val.attrs) - attrs_d.pop("datatype", "") - _attrs = "── " + str(attrs_d) if attrs_d else "" - - # is this the last key? - killme = False - try: - k_new = next(it) # get next key - except StopIteration: - char = "└──" - killme = True # we'll have to kill this loop later - else: - char = "├──" - - print(f"{indent}{char} \033[1m{key}\033[0m · {dtype} {_attrs}") # noqa: T201 - - # if it's a group, call this function recursively - if isinstance(val, h5py.Group): - show( - val, - indent=indent + (" " if killme else "│ "), - header=False, - attrs=attrs, - ) - - # break or move to next key - if killme: - break - else: - key = k_new - - -def load_nda( - f_list: str | list[str], - par_list: list[str], - lh5_group: str = "", - idx_list: list[np.ndarray | list | tuple] = None, -) -> dict[str, np.ndarray]: - r"""Build a dictionary of :class:`numpy.ndarray`\ s from LH5 data. - - Given a list of files, a list of LH5 table parameters, and an optional - group path, return a NumPy array with all values for each parameter. - - Parameters - ---------- - f_list - A list of files. Can contain wildcards. - par_list - A list of parameters to read from each file. - lh5_group - group path within which to find the specified parameters. - idx_list - for fancy-indexed reads. Must be one index array for each file in - `f_list`. - - Returns - ------- - par_data - A dictionary of the parameter data keyed by the elements of `par_list`. - Each entry contains the data for the specified parameter concatenated - over all files in `f_list`. - """ - if isinstance(f_list, str): - f_list = [f_list] - if idx_list is not None: - idx_list = [idx_list] - if idx_list is not None and len(f_list) != len(idx_list): - raise ValueError( - f"f_list length ({len(f_list)}) != idx_list length ({len(idx_list)})!" 
- ) - - # Expand wildcards - f_list = [f for f_wc in f_list for f in sorted(glob.glob(os.path.expandvars(f_wc)))] - - sto = LH5Store() - par_data = {par: [] for par in par_list} - for ii, f in enumerate(f_list): - f = sto.gimme_file(f, "r") - for par in par_list: - if f"{lh5_group}/{par}" not in f: - raise RuntimeError(f"'{lh5_group}/{par}' not in file {f_list[ii]}") - - if idx_list is None: - data, _ = sto.read_object(f"{lh5_group}/{par}", f) - else: - data, _ = sto.read_object(f"{lh5_group}/{par}", f, idx=idx_list[ii]) - if not data: - continue - par_data[par].append(data.nda) - par_data = {par: np.concatenate(par_data[par]) for par in par_list} - return par_data - - -def load_dfs( - f_list: str | list[str], - par_list: list[str], - lh5_group: str = "", - idx_list: list[np.ndarray | list | tuple] = None, -) -> pd.DataFrame: - """Build a :class:`pandas.DataFrame` from LH5 data. - - Given a list of files (can use wildcards), a list of LH5 columns, and - optionally the group path, return a :class:`pandas.DataFrame` with all - values for each parameter. - - See Also - -------- - :func:`load_nda` - - Returns - ------- - dataframe - contains columns for each parameter in `par_list`, and rows containing - all data for the associated parameters concatenated over all files in - `f_list`. - """ - return pd.DataFrame( - load_nda(f_list, par_list, lh5_group=lh5_group, idx_list=idx_list) + warn( + "lgdo.lh5_store has moved to a subfolder lgdo.lh5. " + "Please replace 'from lgdo.lh5_store import show' with 'from lgdo.lh5 import show'. " + "lgdo.lh5_store will be removed in a future release.", + DeprecationWarning, + stacklevel=2, ) - - -class LH5Iterator(Iterator): - """ - A class for iterating through one or more LH5 files, one block of entries - at a time. This also accepts an entry list/mask to enable event selection, - and a field mask. - - This class can be used either for random access: - - >>> lh5_obj, n_rows = lh5_it.read(entry) - - to read the block of entries starting at entry. In case of multiple files - or the use of an event selection, entry refers to a global event index - across files and does not count events that are excluded by the selection. - - This can also be used as an iterator: - - >>> for lh5_obj, entry, n_rows in LH5Iterator(...): - >>> # do the thing! - - This is intended for if you are reading a large quantity of data but - want to limit your memory usage (particularly when reading in waveforms!). - The ``lh5_obj`` that is read by this class is reused in order to avoid - reallocation of memory; this means that if you want to hold on to data - between reads, you will have to copy it somewhere! - """ - - def __init__( - self, - lh5_files: str | list[str], - groups: str | list[str], - base_path: str = "", - entry_list: list[int] | list[list[int]] = None, - entry_mask: list[bool] | list[list[bool]] = None, - field_mask: dict[str, bool] | list[str] | tuple[str] = None, - buffer_len: int = 3200, - friend: LH5Iterator = None, - ) -> None: - """ - Parameters - ---------- - lh5_files - file or files to read from. May include wildcards and environment - variables. - groups - HDF5 group(s) to read. If a list is provided for both lh5_files - and group, they must be the same size. If a file is wild-carded, - the same group will be assigned to each file found - entry_list - list of entry numbers to read. If a nested list is provided, - expect one top-level list for each file, containing a list of - local entries. If a list of ints is provided, use global entries. 
- entry_mask - mask of entries to read. If a list of arrays is provided, expect - one for each file. Ignore if a selection list is provided. - field_mask - mask of which fields to read. See :meth:`LH5Store.read_object` for - more details. - buffer_len - number of entries to read at a time while iterating through files. - friend - a ''friend'' LH5Iterator that will be read in parallel with this. - The friend should have the same length and entry list. A single - LH5 table containing columns from both iterators will be returned. - """ - self.lh5_st = LH5Store(base_path=base_path, keep_open=True) - - # List of files, with wildcards and env vars expanded - if isinstance(lh5_files, str): - lh5_files = [lh5_files] - if isinstance(groups, list): - lh5_files *= len(groups) - elif not isinstance(lh5_files, list): - raise ValueError("lh5_files must be a string or list of strings") - - if isinstance(groups, str): - groups = [groups] * len(lh5_files) - elif not isinstance(groups, list): - raise ValueError("group must be a string or list of strings") - - if not len(groups) == len(lh5_files): - raise ValueError("lh5_files and groups must have same length") - - self.lh5_files = [] - self.groups = [] - for f, g in zip(lh5_files, groups): - f_exp = expand_path(f, list=True, base_path=base_path) - self.lh5_files += f_exp - self.groups += [g] * len(f_exp) - - if entry_list is not None and entry_mask is not None: - raise ValueError( - "entry_list and entry_mask arguments are mutually exclusive" - ) - - # Map to last row in each file - self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i") - # Map to last iterator entry for each file - self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i") - self.buffer_len = buffer_len - - if len(self.lh5_files) > 0: - f = self.lh5_files[0] - g = self.groups[0] - self.lh5_buffer = self.lh5_st.get_buffer( - g, - f, - size=self.buffer_len, - field_mask=field_mask, - ) - self.file_map[0] = self.lh5_st.read_n_rows(g, f) - else: - raise RuntimeError(f"can't open any files from {lh5_files}") - - self.n_rows = 0 - self.current_entry = 0 - self.next_entry = 0 - - self.field_mask = field_mask - - # List of entry indices from each file - self.local_entry_list = None - self.global_entry_list = None - if entry_list is not None: - entry_list = list(entry_list) - if isinstance(entry_list[0], int): - self.local_entry_list = [None] * len(self.file_map) - self.global_entry_list = np.array(entry_list, "i") - self.global_entry_list.sort() - - else: - self.local_entry_list = [[]] * len(self.file_map) - for i_file, local_list in enumerate(entry_list): - self.local_entry_list[i_file] = np.array(local_list, "i") - self.local_entry_list[i_file].sort() - - elif entry_mask is not None: - # Convert entry mask into an entry list - if isinstance(entry_mask, pd.Series): - entry_mask = entry_mask.values - if isinstance(entry_mask, np.ndarray): - self.local_entry_list = [None] * len(self.file_map) - self.global_entry_list = np.nonzero(entry_mask)[0] - else: - self.local_entry_list = [[]] * len(self.file_map) - for i_file, local_mask in enumerate(entry_mask): - self.local_entry_list[i_file] = np.nonzero(local_mask)[0] - - # Attach the friend - if friend is not None: - if not isinstance(friend, LH5Iterator): - raise ValueError("Friend must be an LH5Iterator") - self.lh5_buffer.join(friend.lh5_buffer) - self.friend = friend - - def _get_file_cumlen(self, i_file: int) -> int: - """Helper to get cumulative file length of file""" - if i_file < 0: - return 0 - fcl = 
self.file_map[i_file] - if fcl == np.iinfo("i").max: - fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows( - self.groups[i_file], self.lh5_files[i_file] - ) - self.file_map[i_file] = fcl - return fcl - - def _get_file_cumentries(self, i_file: int) -> int: - """Helper to get cumulative iterator entries in file""" - if i_file < 0: - return 0 - n = self.entry_map[i_file] - if n == np.iinfo("i").max: - elist = self.get_file_entrylist(i_file) - fcl = self._get_file_cumlen(i_file) - if elist is None: - # no entry list provided - n = fcl - else: - file_entries = self.get_file_entrylist(i_file) - n = len(file_entries) - # check that file entries fall inside of file - if n > 0 and file_entries[-1] >= fcl: - logging.warning(f"Found entries out of range for file {i_file}") - n = np.searchsorted(file_entries, fcl, "right") - n += self._get_file_cumentries(i_file - 1) - self.entry_map[i_file] = n - return n - - def get_file_entrylist(self, i_file: int) -> np.ndarray: - """Helper to get entry list for file""" - # If no entry list is provided - if self.local_entry_list is None: - return None - - elist = self.local_entry_list[i_file] - if elist is None: - # Get local entrylist for this file from global entry list - f_start = self._get_file_cumlen(i_file - 1) - f_end = self._get_file_cumlen(i_file) - i_start = self._get_file_cumentries(i_file - 1) - i_stop = np.searchsorted(self.global_entry_list, f_end, "right") - elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start - self.local_entry_list[i_file] = elist - return elist - - def get_global_entrylist(self) -> np.ndarray: - """Get global entry list, constructing it if needed""" - if self.global_entry_list is None and self.local_entry_list is not None: - self.global_entry_list = np.zeros(len(self), "i") - for i_file in range(len(self.lh5_files)): - i_start = self.get_file_cumentries(i_file - 1) - i_stop = self.get_file_cumentries(i_file) - f_start = self.get_file_cumlen(i_file - 1) - self.global_entry_list[i_start:i_stop] = ( - self.get_file_entrylist(i_file) + f_start - ) - return self.global_entry_list - - def read(self, entry: int) -> tuple[LGDO, int]: - """Read the nextlocal chunk of events, starting at entry. 
Return the - LH5 buffer and number of rows read.""" - self.n_rows = 0 - i_file = np.searchsorted(self.entry_map, entry, "right") - - # if file hasn't been opened yet, search through files - # sequentially until we find the right one - if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max: - while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries( - i_file - ): - i_file += 1 - - if i_file == len(self.lh5_files): - return (self.lh5_buffer, self.n_rows) - local_entry = entry - self._get_file_cumentries(i_file - 1) - - while self.n_rows < self.buffer_len and i_file < len(self.file_map): - # Loop through files - local_idx = self.get_file_entrylist(i_file) - if local_idx is not None and len(local_idx) == 0: - i_file += 1 - local_entry = 0 - continue - - i_local = local_idx[local_entry] if local_idx is not None else local_entry - self.lh5_buffer, n_rows = self.lh5_st.read_object( - self.groups[i_file], - self.lh5_files[i_file], - start_row=i_local, - n_rows=self.buffer_len - self.n_rows, - idx=local_idx, - field_mask=self.field_mask, - obj_buf=self.lh5_buffer, - obj_buf_start=self.n_rows, - ) - - self.n_rows += n_rows - i_file += 1 - local_entry = 0 - - self.current_entry = entry - - if self.friend is not None: - self.friend.read(entry) - - return (self.lh5_buffer, self.n_rows) - - def reset_field_mask(self, mask): - """Replaces the field mask of this iterator and any friends with mask""" - self.field_mask = mask - if self.friend is not None: - self.friend.reset_field_mask(mask) - - def __len__(self) -> int: - """Return the total number of entries.""" - return ( - self._get_file_cumentries(len(self.lh5_files) - 1) - if len(self.entry_map) > 0 - else 0 - ) - - def __iter__(self) -> Iterator: - """Loop through entries in blocks of size buffer_len.""" - self.current_entry = 0 - self.next_entry = 0 - return self - - def __next__(self) -> tuple[LGDO, int, int]: - """Read next buffer_len entries and return lh5_table, iterator entry - and n_rows read.""" - buf, n_rows = self.read(self.next_entry) - self.next_entry = self.current_entry + n_rows - if n_rows == 0: - raise StopIteration - return (buf, self.current_entry, n_rows) - - -@nb.njit(parallel=False, fastmath=True) -def _make_fd_idx(starts, stops, idx): - k = 0 - if len(starts) < len(stops): - for i in range(stops[0]): - idx[k] = i - k += 1 - stops = stops[1:] - for j in range(len(starts)): - for i in range(starts[j], stops[j]): - idx[k] = i - k += 1 - return (idx,) + lh5.show(lh5_file, lh5_group, attrs, indent, header) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index 30a47bd2..54fd76f3 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -10,7 +10,7 @@ import numpy as np -from .. import lgdo_utils as utils +from .. import utils as utils from .lgdo import LGDO log = logging.getLogger(__name__) diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py index 95884bc9..bf16ed8d 100644 --- a/src/lgdo/types/arrayofequalsizedarrays.py +++ b/src/lgdo/types/arrayofequalsizedarrays.py @@ -9,7 +9,7 @@ import numpy as np -from .. import lgdo_utils as utils +from .. import utils as utils from . import vectorofvectors as vov from .array import Array diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 68886273..766001b3 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -6,7 +6,7 @@ import numpy as np from numpy.typing import NDArray -from .. import lgdo_utils as utils +from .. 
import utils as utils from .array import Array from .lgdo import LGDO from .scalar import Scalar diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py index 6b793137..e79bb932 100644 --- a/src/lgdo/types/scalar.py +++ b/src/lgdo/types/scalar.py @@ -7,7 +7,7 @@ import numpy as np -from .. import lgdo_utils as utils +from .. import utils as utils from .lgdo import LGDO log = logging.getLogger(__name__) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 7d227a52..2b0d7f13 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -13,7 +13,7 @@ import numpy as np from numpy.typing import DTypeLike, NDArray -from .. import lgdo_utils as utils +from .. import utils as utils from . import arrayofequalsizedarrays as aoesa from .array import Array from .lgdo import LGDO diff --git a/src/lgdo/utils.py b/src/lgdo/utils.py new file mode 100644 index 00000000..22866a35 --- /dev/null +++ b/src/lgdo/utils.py @@ -0,0 +1,84 @@ +"""Implements utilities for LEGEND Data Objects.""" +from __future__ import annotations + +import logging + +import numpy as np + +from . import types as lgdo + +log = logging.getLogger(__name__) + + +def get_element_type(obj: object) -> str: + """Get the LGDO element type of a scalar or array. + + For use in LGDO datatype attributes. + + Parameters + ---------- + obj + if a ``str``, will automatically return ``string`` if the object has + a :class:`numpy.dtype`, that will be used for determining the element + type otherwise will attempt to case the type of the object to a + :class:`numpy.dtype`. + + Returns + ------- + element_type + A string stating the determined element type of the object. + """ + + # special handling for strings + if isinstance(obj, str): + return "string" + + # the rest use dtypes + dt = obj.dtype if hasattr(obj, "dtype") else np.dtype(type(obj)) + kind = dt.kind + + if kind == "b": + return "bool" + if kind == "V": + return "blob" + if kind in ["i", "u", "f"]: + return "real" + if kind == "c": + return "complex" + if kind in ["S", "U"]: + return "string" + + # couldn't figure it out + raise ValueError( + "cannot determine lgdo element_type for object of type", type(obj).__name__ + ) + + +def copy(obj: lgdo.LGDO, dtype: np.dtype = None) -> lgdo.LGDO: + """Return a copy of an LGDO. + + Parameters + ---------- + obj + the LGDO to be copied. + dtype + NumPy dtype to be used for the copied object. 
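A possible usage sketch for this helper, assuming an integer ``Array`` (the names below are only illustrative):

>>> import numpy as np
>>> from lgdo import Array, utils
>>> a = Array(np.array([1, 2, 3]))
>>> b = utils.copy(a, dtype=np.float32)  # independent copy with the requested dtype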
+ + """ + if dtype is None: + dtype = obj.dtype + + if isinstance(obj, lgdo.Array): + return lgdo.Array( + np.array(obj.nda, dtype=dtype, copy=True), attrs=dict(obj.attrs) + ) + + if isinstance(obj, lgdo.VectorOfVectors): + return lgdo.VectorOfVectors( + flattened_data=copy(obj.flattened_data, dtype=dtype), + cumulative_length=copy(obj.cumulative_length), + attrs=dict(obj.attrs), + ) + + else: + raise ValueError(f"copy of {type(obj)} not supported") diff --git a/tests/compression/conftest.py b/tests/compression/conftest.py index 927ba1ff..e69cc307 100644 --- a/tests/compression/conftest.py +++ b/tests/compression/conftest.py @@ -1,12 +1,12 @@ import pytest -from lgdo import LH5Store +import lgdo.lh5 as lh5 @pytest.fixture() def wftable(lgnd_test_data): - store = LH5Store() - wft, _ = store.read_object( + store = lh5.LH5Store() + wft, _ = store.read( "/geds/raw/waveform", lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5"), ) diff --git a/tests/compression/test_radware_sigcompress.py b/tests/compression/test_radware_sigcompress.py index aacf38f6..fe0bdd99 100644 --- a/tests/compression/test_radware_sigcompress.py +++ b/tests/compression/test_radware_sigcompress.py @@ -2,7 +2,8 @@ import numpy as np -from lgdo import ArrayOfEncodedEqualSizedArrays, ArrayOfEqualSizedArrays, LH5Store +import lgdo.lh5 as lh5 +from lgdo import ArrayOfEncodedEqualSizedArrays, ArrayOfEqualSizedArrays from lgdo.compression.radware import ( _get_hton_u16, _radware_sigcompress_decode, @@ -177,8 +178,8 @@ def test_aoesa(wftable): def test_performance(lgnd_test_data): - store = LH5Store() - obj, _ = store.read_object( + store = lh5.LH5Store() + obj, _ = store.read( "/geds/raw/waveform", lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5"), ) diff --git a/tests/test_lh5_iterator.py b/tests/lh5/test_lh5_iterator.py similarity index 92% rename from tests/test_lh5_iterator.py rename to tests/lh5/test_lh5_iterator.py index 09297665..95e575af 100644 --- a/tests/test_lh5_iterator.py +++ b/tests/lh5/test_lh5_iterator.py @@ -2,7 +2,7 @@ import pytest import lgdo -from lgdo.lh5_store import LH5Iterator +import lgdo.lh5 as lh5 @pytest.fixture(scope="module") @@ -11,7 +11,7 @@ def lgnd_file(lgnd_test_data): def test_basics(lgnd_file): - lh5_it = LH5Iterator( + lh5_it = lh5.LH5Iterator( lgnd_file, "/geds/raw", entry_list=range(100), @@ -35,14 +35,14 @@ def test_basics(lgnd_file): def test_errors(lgnd_file): with pytest.raises(RuntimeError): - LH5Iterator("non-existent-file.lh5", "random-group") + lh5.LH5Iterator("non-existent-file.lh5", "random-group") with pytest.raises(ValueError): - LH5Iterator(1, 2) + lh5.LH5Iterator(1, 2) def test_lgnd_waveform_table_fancy_idx(lgnd_file): - lh5_it = LH5Iterator( + lh5_it = lh5.LH5Iterator( lgnd_file, "geds/raw/waveform", entry_list=[ @@ -97,13 +97,13 @@ def more_lgnd_files(lgnd_test_data): def test_friend(more_lgnd_files): - lh5_raw_it = LH5Iterator( + lh5_raw_it = lh5.LH5Iterator( more_lgnd_files[0], "ch1084803/raw", field_mask=["waveform", "baseline"], buffer_len=5, ) - lh5_it = LH5Iterator( + lh5_it = lh5.LH5Iterator( more_lgnd_files[1], "ch1084803/hit", field_mask=["is_valid_0vbb"], @@ -121,7 +121,7 @@ def test_friend(more_lgnd_files): def test_iterate(more_lgnd_files): # iterate through all hit groups in all files; there are 10 entries in # each group/file - lh5_it = LH5Iterator( + lh5_it = lh5.LH5Iterator( more_lgnd_files[1] * 3, ["ch1084803/hit"] * 2 + ["ch1084804/hit"] * 2 + ["ch1121600/hit"] * 2, field_mask=["is_valid_0vbb", "timestamp", 
"zacEmax_ctc_cal"], diff --git a/tests/test_lh5_store.py b/tests/lh5/test_lh5_store.py similarity index 71% rename from tests/test_lh5_store.py rename to tests/lh5/test_lh5_store.py index 25491660..9d2d254c 100644 --- a/tests/test_lh5_store.py +++ b/tests/lh5/test_lh5_store.py @@ -7,10 +7,11 @@ import pytest import lgdo -import lgdo.lh5_store as lh5 +import lgdo.lh5 as lh5 +import lgdo.types as types from lgdo import compression from lgdo.compression import RadwareSigcompress -from lgdo.lh5_store import DEFAULT_HDF5_SETTINGS, LH5Store +from lgdo.lh5.store import DEFAULT_HDF5_SETTINGS @pytest.fixture(scope="module") @@ -19,11 +20,11 @@ def lgnd_file(lgnd_test_data): def test_init(): - LH5Store() + lh5.LH5Store() def test_gimme_file(lgnd_file): - store = LH5Store(keep_open=True) + store = lh5.LH5Store(keep_open=True) f = store.gimme_file(lgnd_file) assert isinstance(f, h5py.File) @@ -35,7 +36,7 @@ def test_gimme_file(lgnd_file): def test_gimme_group(lgnd_file, tmptestdir): f = h5py.File(lgnd_file) - store = LH5Store() + store = lh5.LH5Store() g = store.gimme_group("/geds", f) assert isinstance(g, h5py.Group) @@ -44,12 +45,6 @@ def test_gimme_group(lgnd_file, tmptestdir): assert isinstance(g, h5py.Group) -def test_show(lgnd_file): - lh5.show(lgnd_file) - lh5.show(lgnd_file, "/geds/raw") - lh5.show(lgnd_file, "geds/raw") - - def test_ls(lgnd_file): assert lh5.ls(lgnd_file) == ["geds"] assert lh5.ls(lgnd_file, "/*/raw") == ["geds/raw"] @@ -68,6 +63,12 @@ def test_ls(lgnd_file): ] +def test_show(lgnd_file): + lh5.show(lgnd_file) + lh5.show(lgnd_file, "/geds/raw") + lh5.show(lgnd_file, "geds/raw") + + def test_load_nda(lgnd_file): nda = lh5.load_nda( [lgnd_file, lgnd_file], @@ -83,49 +84,38 @@ def test_load_nda(lgnd_file): assert nda["waveform/values"].shape == (6, 5592) -def test_load_dfs(lgnd_file): - dfs = lh5.load_dfs( - [lgnd_file, lgnd_file], - ["baseline", "waveform/t0"], - lh5_group="/geds/raw", - idx_list=[[1, 3, 5], [2, 6, 7]], - ) - - assert isinstance(dfs, pd.DataFrame) - - @pytest.fixture(scope="module") def lh5_file(tmptestdir): - store = LH5Store() + store = lh5.LH5Store() struct = lgdo.Struct() struct.add_field("scalar", lgdo.Scalar(value=10, attrs={"sth": 1})) - struct.add_field("array", lgdo.Array(nda=np.array([1, 2, 3, 4, 5]))) + struct.add_field("array", types.Array(nda=np.array([1, 2, 3, 4, 5]))) struct.add_field( "aoesa", - lgdo.ArrayOfEqualSizedArrays(shape=(5, 5), dtype=np.float32, fill_val=42), + types.ArrayOfEqualSizedArrays(shape=(5, 5), dtype=np.float32, fill_val=42), ) struct.add_field( "vov", - lgdo.VectorOfVectors( - flattened_data=lgdo.Array( + types.VectorOfVectors( + flattened_data=types.Array( nda=np.array([1, 2, 3, 4, 5, 2, 4, 8, 9, 7, 5, 3, 1]) ), - cumulative_length=lgdo.Array(nda=np.array([2, 5, 6, 10, 13])), + cumulative_length=types.Array(nda=np.array([2, 5, 6, 10, 13])), attrs={"myattr": 2}, ), ) struct.add_field( "voev", - lgdo.VectorOfEncodedVectors( - encoded_data=lgdo.VectorOfVectors( - flattened_data=lgdo.Array( + types.VectorOfEncodedVectors( + encoded_data=types.VectorOfVectors( + flattened_data=types.Array( nda=np.array([1, 2, 3, 4, 5, 2, 4, 8, 9, 7, 5, 3, 1]) ), - cumulative_length=lgdo.Array(nda=np.array([2, 5, 6, 10, 13])), + cumulative_length=types.Array(nda=np.array([2, 5, 6, 10, 13])), ), - decoded_size=lgdo.Array(shape=5, fill_val=6), + decoded_size=types.Array(shape=5, fill_val=6), ), ) @@ -142,14 +132,14 @@ def lh5_file(tmptestdir): ), } - struct.add_field("table", lgdo.Table(col_dict=col_dict, attrs={"stuff": 5})) + 
struct.add_field("table", types.Table(col_dict=col_dict, attrs={"stuff": 5})) struct.add_field( "wftable", - lgdo.WaveformTable( - t0=lgdo.Array(np.zeros(10)), - dt=lgdo.Array(np.full(10, fill_value=1)), - values=lgdo.ArrayOfEqualSizedArrays( + types.WaveformTable( + t0=types.Array(np.zeros(10)), + dt=types.Array(np.full(10, fill_value=1)), + values=types.ArrayOfEqualSizedArrays( shape=(10, 1000), dtype=np.uint16, fill_val=100, attrs={"custom": 8} ), ), @@ -157,16 +147,16 @@ def lh5_file(tmptestdir): struct.add_field( "wftable_enc", - lgdo.WaveformTable( - t0=lgdo.Array(np.zeros(10)), - dt=lgdo.Array(np.full(10, fill_value=1)), + types.WaveformTable( + t0=types.Array(np.zeros(10)), + dt=types.Array(np.full(10, fill_value=1)), values=compression.encode( struct["wftable"].values, codec=RadwareSigcompress(codec_shift=-32768) ), ), ) - store.write_object( + store.write( struct, "struct", f"{tmptestdir}/tmp-pygama-lgdo-types.lh5", @@ -176,7 +166,7 @@ def lh5_file(tmptestdir): wo_mode="overwrite_file", ) - store.write_object( + store.write( struct, "struct_full", f"{tmptestdir}/tmp-pygama-lgdo-types.lh5", @@ -194,7 +184,7 @@ def test_write_objects(lh5_file): def test_read_n_rows(lh5_file): - store = LH5Store() + store = lh5.LH5Store() assert store.read_n_rows("/data/struct_full/aoesa", lh5_file) == 5 assert store.read_n_rows("/data/struct_full/array", lh5_file) == 5 assert store.read_n_rows("/data/struct_full/scalar", lh5_file) is None @@ -206,14 +196,14 @@ def test_read_n_rows(lh5_file): def test_get_buffer(lh5_file): - store = LH5Store() + store = lh5.LH5Store() buf = store.get_buffer("/data/struct_full/wftable_enc", lh5_file) - assert isinstance(buf.values, lgdo.ArrayOfEqualSizedArrays) + assert isinstance(buf.values, types.ArrayOfEqualSizedArrays) def test_read_scalar(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct/scalar", lh5_file) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/scalar", lh5_file) assert isinstance(lh5_obj, lgdo.Scalar) assert lh5_obj.value == 10 assert n_rows == 1 @@ -223,9 +213,9 @@ def test_read_scalar(lh5_file): def test_read_array(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct/array", lh5_file) - assert isinstance(lh5_obj, lgdo.Array) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/array", lh5_file) + assert isinstance(lh5_obj, types.Array) assert (lh5_obj.nda == np.array([2, 3, 4])).all() assert n_rows == 3 with h5py.File(lh5_file) as h5f: @@ -236,19 +226,17 @@ def test_read_array(lh5_file): def test_read_array_fancy_idx(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object( - "/data/struct_full/array", lh5_file, idx=[0, 3, 4] - ) - assert isinstance(lh5_obj, lgdo.Array) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct_full/array", lh5_file, idx=[0, 3, 4]) + assert isinstance(lh5_obj, types.Array) assert (lh5_obj.nda == np.array([1, 4, 5])).all() assert n_rows == 3 def test_read_vov(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct/vov", lh5_file) - assert isinstance(lh5_obj, lgdo.VectorOfVectors) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/vov", lh5_file) + assert isinstance(lh5_obj, types.VectorOfVectors) desired = [np.array([3, 4, 5]), np.array([2]), np.array([4, 8, 9, 7])] @@ -270,9 +258,9 @@ def test_read_vov(lh5_file): def test_read_vov_fancy_idx(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct_full/vov", lh5_file, 
idx=[0, 2]) - assert isinstance(lh5_obj, lgdo.VectorOfVectors) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct_full/vov", lh5_file, idx=[0, 2]) + assert isinstance(lh5_obj, types.VectorOfVectors) desired = [np.array([1, 2]), np.array([2])] @@ -283,9 +271,9 @@ def test_read_vov_fancy_idx(lh5_file): def test_read_voev(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct/voev", lh5_file, decompress=False) - assert isinstance(lh5_obj, lgdo.VectorOfEncodedVectors) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/voev", lh5_file, decompress=False) + assert isinstance(lh5_obj, types.VectorOfEncodedVectors) desired = [np.array([3, 4, 5]), np.array([2]), np.array([4, 8, 9, 7])] @@ -294,10 +282,10 @@ def test_read_voev(lh5_file): assert n_rows == 3 - lh5_obj, n_rows = store.read_object( + lh5_obj, n_rows = store.read( "/data/struct/voev", [lh5_file, lh5_file], decompress=False ) - assert isinstance(lh5_obj, lgdo.VectorOfEncodedVectors) + assert isinstance(lh5_obj, types.VectorOfEncodedVectors) assert n_rows == 6 with h5py.File(lh5_file) as h5f: @@ -313,11 +301,11 @@ def test_read_voev(lh5_file): def test_read_voev_fancy_idx(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object( + store = lh5.LH5Store() + lh5_obj, n_rows = store.read( "/data/struct_full/voev", lh5_file, idx=[0, 2], decompress=False ) - assert isinstance(lh5_obj, lgdo.VectorOfEncodedVectors) + assert isinstance(lh5_obj, types.VectorOfEncodedVectors) desired = [np.array([1, 2]), np.array([2])] @@ -328,27 +316,27 @@ def test_read_voev_fancy_idx(lh5_file): def test_read_aoesa(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct/aoesa", lh5_file) - assert isinstance(lh5_obj, lgdo.ArrayOfEqualSizedArrays) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/aoesa", lh5_file) + assert isinstance(lh5_obj, types.ArrayOfEqualSizedArrays) assert (lh5_obj.nda == np.full((3, 5), fill_value=42)).all() def test_read_table(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct/table", lh5_file) - assert isinstance(lh5_obj, lgdo.Table) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/table", lh5_file) + assert isinstance(lh5_obj, types.Table) assert n_rows == 3 - lh5_obj, n_rows = store.read_object("/data/struct/table", [lh5_file, lh5_file]) + lh5_obj, n_rows = store.read("/data/struct/table", [lh5_file, lh5_file]) assert n_rows == 6 assert lh5_obj.attrs["stuff"] == 5 assert lh5_obj["a"].attrs["attr"] == 9 def test_read_hdf5_compressed_data(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct/table", lh5_file) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/table", lh5_file) assert "compression" not in lh5_obj["b"].attrs with h5py.File(lh5_file) as h5f: @@ -363,12 +351,12 @@ def test_read_hdf5_compressed_data(lh5_file): def test_read_wftable(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/data/struct/wftable", lh5_file) - assert isinstance(lh5_obj, lgdo.WaveformTable) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/wftable", lh5_file) + assert isinstance(lh5_obj, types.WaveformTable) assert n_rows == 3 - lh5_obj, n_rows = store.read_object("/data/struct/wftable", [lh5_file, lh5_file]) + lh5_obj, n_rows = store.read("/data/struct/wftable", [lh5_file, lh5_file]) assert n_rows == 6 assert lh5_obj.values.attrs["custom"] == 8 @@ -388,32 +376,30 @@ def 
test_read_wftable(lh5_file): def test_read_wftable_encoded(lh5_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object( - "/data/struct/wftable_enc", lh5_file, decompress=False - ) - assert isinstance(lh5_obj, lgdo.WaveformTable) - assert isinstance(lh5_obj.values, lgdo.ArrayOfEncodedEqualSizedArrays) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/data/struct/wftable_enc", lh5_file, decompress=False) + assert isinstance(lh5_obj, types.WaveformTable) + assert isinstance(lh5_obj.values, types.ArrayOfEncodedEqualSizedArrays) assert n_rows == 3 assert lh5_obj.values.attrs["codec"] == "radware_sigcompress" assert "codec_shift" in lh5_obj.values.attrs - lh5_obj, n_rows = store.read_object("/data/struct/wftable_enc/values", lh5_file) + lh5_obj, n_rows = store.read("/data/struct/wftable_enc/values", lh5_file) assert isinstance(lh5_obj, lgdo.ArrayOfEqualSizedArrays) assert n_rows == 3 - lh5_obj, n_rows = store.read_object("/data/struct/wftable_enc", lh5_file) + lh5_obj, n_rows = store.read("/data/struct/wftable_enc", lh5_file) assert isinstance(lh5_obj, lgdo.WaveformTable) assert isinstance(lh5_obj.values, lgdo.ArrayOfEqualSizedArrays) assert n_rows == 3 - lh5_obj_chain, n_rows = store.read_object( + lh5_obj_chain, n_rows = store.read( "/data/struct/wftable_enc", [lh5_file, lh5_file], decompress=False ) assert n_rows == 6 assert isinstance(lh5_obj_chain.values, lgdo.ArrayOfEncodedEqualSizedArrays) - lh5_obj_chain, n_rows = store.read_object( + lh5_obj_chain, n_rows = store.read( "/data/struct/wftable_enc", [lh5_file, lh5_file], decompress=True ) assert isinstance(lh5_obj_chain.values, lgdo.ArrayOfEqualSizedArrays) @@ -440,24 +426,22 @@ def test_read_wftable_encoded(lh5_file): def test_read_with_field_mask(lh5_file): - store = LH5Store() + store = lh5.LH5Store() - lh5_obj, n_rows = store.read_object( - "/data/struct_full", lh5_file, field_mask=["array"] - ) + lh5_obj, n_rows = store.read("/data/struct_full", lh5_file, field_mask=["array"]) assert list(lh5_obj.keys()) == ["array"] - lh5_obj, n_rows = store.read_object( + lh5_obj, n_rows = store.read( "/data/struct_full", lh5_file, field_mask=("array", "table") ) assert list(lh5_obj.keys()) == ["array", "table"] - lh5_obj, n_rows = store.read_object( + lh5_obj, n_rows = store.read( "/data/struct_full", lh5_file, field_mask={"array": True} ) assert list(lh5_obj.keys()) == ["array"] - lh5_obj, n_rows = store.read_object( + lh5_obj, n_rows = store.read( "/data/struct_full", lh5_file, field_mask={"vov": False, "voev": False} ) assert list(lh5_obj.keys()) == [ @@ -471,45 +455,45 @@ def test_read_with_field_mask(lh5_file): def test_read_lgnd_array(lgnd_file): - store = LH5Store() + store = lh5.LH5Store() - lh5_obj, n_rows = store.read_object("/geds/raw/baseline", lgnd_file) - assert isinstance(lh5_obj, lgdo.Array) + lh5_obj, n_rows = store.read("/geds/raw/baseline", lgnd_file) + assert isinstance(lh5_obj, types.Array) assert n_rows == 100 assert len(lh5_obj) == 100 - lh5_obj, n_rows = store.read_object("/geds/raw/waveform/values", lgnd_file) - assert isinstance(lh5_obj, lgdo.ArrayOfEqualSizedArrays) + lh5_obj, n_rows = store.read("/geds/raw/waveform/values", lgnd_file) + assert isinstance(lh5_obj, types.ArrayOfEqualSizedArrays) def test_read_lgnd_array_fancy_idx(lgnd_file): - store = LH5Store() + store = lh5.LH5Store() - lh5_obj, n_rows = store.read_object( + lh5_obj, n_rows = store.read( "/geds/raw/baseline", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68] ) - assert isinstance(lh5_obj, lgdo.Array) + assert isinstance(lh5_obj, types.Array) 
assert n_rows == 7 assert len(lh5_obj) == 7 assert (lh5_obj.nda == [13508, 14353, 14525, 14341, 15079, 11675, 13995]).all() def test_read_lgnd_vov(lgnd_file): - store = LH5Store() + store = lh5.LH5Store() - lh5_obj, n_rows = store.read_object("/geds/raw/tracelist", lgnd_file) - assert isinstance(lh5_obj, lgdo.VectorOfVectors) + lh5_obj, n_rows = store.read("/geds/raw/tracelist", lgnd_file) + assert isinstance(lh5_obj, types.VectorOfVectors) assert n_rows == 100 assert len(lh5_obj) == 100 def test_read_lgnd_vov_fancy_idx(lgnd_file): - store = LH5Store() + store = lh5.LH5Store() - lh5_obj, n_rows = store.read_object( + lh5_obj, n_rows = store.read( "/geds/raw/tracelist", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68] ) - assert isinstance(lh5_obj, lgdo.VectorOfVectors) + assert isinstance(lh5_obj, types.VectorOfVectors) assert n_rows == 7 assert len(lh5_obj) == 7 assert (lh5_obj.cumulative_length.nda == [1, 2, 3, 4, 5, 6, 7]).all() @@ -517,20 +501,20 @@ def test_read_lgnd_vov_fancy_idx(lgnd_file): def test_read_array_concatenation(lgnd_file): - store = LH5Store() - lh5_obj, n_rows = store.read_object("/geds/raw/baseline", [lgnd_file, lgnd_file]) - assert isinstance(lh5_obj, lgdo.Array) + store = lh5.LH5Store() + lh5_obj, n_rows = store.read("/geds/raw/baseline", [lgnd_file, lgnd_file]) + assert isinstance(lh5_obj, types.Array) assert n_rows == 200 assert len(lh5_obj) == 200 def test_read_lgnd_waveform_table(lgnd_file): - store = LH5Store() + store = lh5.LH5Store() - lh5_obj, n_rows = store.read_object("/geds/raw/waveform", lgnd_file) - assert isinstance(lh5_obj, lgdo.WaveformTable) + lh5_obj, n_rows = store.read("/geds/raw/waveform", lgnd_file) + assert isinstance(lh5_obj, types.WaveformTable) - lh5_obj, n_rows = store.read_object( + lh5_obj, n_rows = store.read( "/geds/raw/waveform", lgnd_file, start_row=10, @@ -538,29 +522,29 @@ def test_read_lgnd_waveform_table(lgnd_file): field_mask=["t0", "dt"], ) - assert isinstance(lh5_obj, lgdo.Table) + assert isinstance(lh5_obj, types.Table) assert list(lh5_obj.keys()) == ["t0", "dt"] assert len(lh5_obj) == 10 def test_read_lgnd_waveform_table_fancy_idx(lgnd_file): - store = LH5Store() + store = lh5.LH5Store() - lh5_obj, n_rows = store.read_object( + lh5_obj, n_rows = store.read( "/geds/raw/waveform", lgnd_file, idx=[7, 9, 25, 27, 33, 38, 46, 52, 57, 59, 67, 71, 72, 82, 90, 92, 93, 94, 97], ) - assert isinstance(lh5_obj, lgdo.WaveformTable) + assert isinstance(lh5_obj, types.WaveformTable) assert len(lh5_obj) == 19 @pytest.fixture(scope="module") def enc_lgnd_file(lgnd_file, tmptestdir): - store = LH5Store() - wft, n_rows = store.read_object("/geds/raw/waveform", lgnd_file) + store = lh5.LH5Store() + wft, n_rows = store.read("/geds/raw/waveform", lgnd_file) wft.values.attrs["compression"] = RadwareSigcompress(codec_shift=-32768) - store.write_object( + store.write( wft, "/geds/raw/waveform", f"{tmptestdir}/tmp-pygama-compressed-wfs.lh5", @@ -574,16 +558,16 @@ def test_write_compressed_lgnd_waveform_table(enc_lgnd_file): def test_read_compressed_lgnd_waveform_table(lgnd_file, enc_lgnd_file): - store = LH5Store() - wft, _ = store.read_object("/geds/raw/waveform", enc_lgnd_file) - assert isinstance(wft.values, lgdo.ArrayOfEqualSizedArrays) + store = lh5.LH5Store() + wft, _ = store.read("/geds/raw/waveform", enc_lgnd_file) + assert isinstance(wft.values, types.ArrayOfEqualSizedArrays) assert "compression" not in wft.values.attrs def test_write_with_hdf5_compression(lgnd_file, tmptestdir): - store = LH5Store() - wft, n_rows = 
store.read_object("/geds/raw/waveform", lgnd_file) - store.write_object( + store = lh5.LH5Store() + wft, n_rows = store.read("/geds/raw/waveform", lgnd_file) + store.write( wft, "/geds/raw/waveform", f"{tmptestdir}/tmp-pygama-hdf5-compressed-wfs.lh5", @@ -597,7 +581,7 @@ def test_write_with_hdf5_compression(lgnd_file, tmptestdir): assert h5f["/geds/raw/waveform/values"].compression_opts == 9 assert h5f["/geds/raw/waveform/values"].shuffle is True - store.write_object( + store.write( wft, "/geds/raw/waveform", f"{tmptestdir}/tmp-pygama-hdf5-compressed-wfs.lh5", @@ -618,13 +602,13 @@ def test_write_object_overwrite_table_no_deletion(caplog, tmptestdir): if os.path.exists(f"{tmptestdir}/write_object_overwrite_test.lh5"): os.remove(f"{tmptestdir}/write_object_overwrite_test.lh5") - tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))}) - tb2 = lh5.Table( - col_dict={"dset1": lh5.Array(np.ones(10))} + tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))}) + tb2 = types.Table( + col_dict={"dset1": types.Array(np.ones(10))} ) # Same field name, different values - store = LH5Store() - store.write_object(tb1, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") - store.write_object( + store = lh5.LH5Store() + store.write(tb1, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") + store.write( tb2, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5", @@ -637,9 +621,7 @@ def test_write_object_overwrite_table_no_deletion(caplog, tmptestdir): ] # Now, check that the data were overwritten - tb_dat, _ = store.read_object( - "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5" - ) + tb_dat, _ = store.read("my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") assert np.array_equal(tb_dat["dset1"].nda, np.ones(10)) @@ -651,13 +633,13 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir): if os.path.exists(f"{tmptestdir}/write_object_overwrite_test.lh5"): os.remove(f"{tmptestdir}/write_object_overwrite_test.lh5") - tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))}) - tb2 = lh5.Table( - col_dict={"dset2": lh5.Array(np.ones(10))} + tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))}) + tb2 = types.Table( + col_dict={"dset2": types.Array(np.ones(10))} ) # Same field name, different values - store = LH5Store() - store.write_object(tb1, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") - store.write_object( + store = lh5.LH5Store() + store.write(tb1, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") + store.write( tb2, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5", @@ -665,9 +647,7 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir): ) # Now, try to overwrite with a different field # Now, check that the data were overwritten - tb_dat, _ = store.read_object( - "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5" - ) + tb_dat, _ = store.read("my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") assert np.array_equal(tb_dat["dset2"].nda, np.ones(10)) # Also make sure that the first table's fields aren't lurking around the lh5 file! 
@@ -678,18 +658,18 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir): if os.path.exists(f"{tmptestdir}/write_object_overwrite_test.lh5"): os.remove(f"{tmptestdir}/write_object_overwrite_test.lh5") - tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))}) - tb2 = lh5.Table( - col_dict={"dset2": lh5.Array(np.ones(10))} + tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))}) + tb2 = types.Table( + col_dict={"dset2": types.Array(np.ones(10))} ) # Same field name, different values - store = LH5Store() - store.write_object( + store = lh5.LH5Store() + store.write( tb1, "my_table", f"{tmptestdir}/write_object_overwrite_test.lh5", group="my_group", ) - store.write_object( + store.write( tb2, "my_table", f"{tmptestdir}/write_object_overwrite_test.lh5", @@ -698,7 +678,7 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir): ) # Now, try to overwrite with a different field # Now, check that the data were overwritten - tb_dat, _ = store.read_object( + tb_dat, _ = store.read( "my_group/my_table", f"{tmptestdir}/write_object_overwrite_test.lh5" ) assert np.array_equal(tb_dat["dset2"].nda, np.ones(10)) @@ -713,11 +693,11 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): caplog.set_level(logging.DEBUG) caplog.clear() - # Start with an lgdo.WaveformTable + # Start with an types.WaveformTable if os.path.exists(f"{tmptestdir}/write_object_overwrite_test.lh5"): os.remove(f"{tmptestdir}/write_object_overwrite_test.lh5") - tb1 = lh5.WaveformTable( + tb1 = types.WaveformTable( t0=np.zeros(10), t0_units="ns", dt=np.zeros(10), @@ -725,7 +705,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): values=np.zeros((10, 10)), values_units="ADC", ) - tb2 = lh5.WaveformTable( + tb2 = types.WaveformTable( t0=np.ones(10), t0_units="ns", dt=np.ones(10), @@ -733,14 +713,14 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): values=np.ones((10, 10)), values_units="ADC", ) # Same field name, different values - store = LH5Store() - store.write_object( + store = lh5.LH5Store() + store.write( tb1, "my_table", f"{tmptestdir}/write_object_overwrite_test.lh5", group="my_group", ) - store.write_object( + store.write( tb2, "my_table", f"{tmptestdir}/write_object_overwrite_test.lh5", @@ -754,19 +734,17 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): ] # Now, check that the data were overwritten - tb_dat, _ = store.read_object( + tb_dat, _ = store.read( "my_group/my_table", f"{tmptestdir}/write_object_overwrite_test.lh5" ) assert np.array_equal(tb_dat["values"].nda, np.ones((10, 10))) # Now try overwriting an array, and test the write_start argument - array1 = lh5.Array(nda=np.zeros(10)) - array2 = lh5.Array(nda=np.ones(20)) - store = LH5Store() - store.write_object( - array1, "my_array", f"{tmptestdir}/write_object_overwrite_test.lh5" - ) - store.write_object( + array1 = types.Array(nda=np.zeros(10)) + array2 = types.Array(nda=np.ones(20)) + store = lh5.LH5Store() + store.write(array1, "my_array", f"{tmptestdir}/write_object_overwrite_test.lh5") + store.write( array2, "my_array", f"{tmptestdir}/write_object_overwrite_test.lh5", @@ -775,7 +753,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): ) # Now, check that the data were overwritten - array_dat, _ = store.read_object( + array_dat, _ = store.read( "my_array", f"{tmptestdir}/write_object_overwrite_test.lh5" ) expected_out_array = np.append(np.zeros(5), np.ones(20)) @@ -783,13 +761,11 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): assert 
np.array_equal(array_dat.nda, expected_out_array) # Now try overwriting a scalar - scalar1 = lh5.Scalar(0) - scalar2 = lh5.Scalar(1) - store = LH5Store() - store.write_object( - scalar1, "my_scalar", f"{tmptestdir}/write_object_overwrite_test.lh5" - ) - store.write_object( + scalar1 = types.Scalar(0) + scalar2 = types.Scalar(1) + store = lh5.LH5Store() + store.write(scalar1, "my_scalar", f"{tmptestdir}/write_object_overwrite_test.lh5") + store.write( scalar2, "my_scalar", f"{tmptestdir}/write_object_overwrite_test.lh5", @@ -797,20 +773,18 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): ) # Now, check that the data were overwritten - scalar_dat, _ = store.read_object( + scalar_dat, _ = store.read( "my_scalar", f"{tmptestdir}/write_object_overwrite_test.lh5" ) assert scalar_dat.value == 1 # Finally, try overwriting a vector of vectors - vov1 = lh5.VectorOfVectors(listoflists=[np.zeros(1), np.ones(2), np.zeros(3)]) - vov2 = lh5.VectorOfVectors(listoflists=[np.ones(1), np.zeros(2), np.ones(3)]) - store = LH5Store() - store.write_object( - vov1, "my_vector", f"{tmptestdir}/write_object_overwrite_test.lh5" - ) - store.write_object( + vov1 = types.VectorOfVectors(listoflists=[np.zeros(1), np.ones(2), np.zeros(3)]) + vov2 = types.VectorOfVectors(listoflists=[np.ones(1), np.zeros(2), np.ones(3)]) + store = lh5.LH5Store() + store.write(vov1, "my_vector", f"{tmptestdir}/write_object_overwrite_test.lh5") + store.write( vov2, "my_vector", f"{tmptestdir}/write_object_overwrite_test.lh5", @@ -818,7 +792,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): write_start=1, ) # start overwriting the second list of lists - vector_dat, _ = store.read_object( + vector_dat, _ = store.read( "my_vector", f"{tmptestdir}/write_object_overwrite_test.lh5" ) @@ -832,14 +806,12 @@ def test_write_object_append_column(tmptestdir): if os.path.exists(f"{tmptestdir}/write_object_append_column_test.lh5"): os.remove(f"{tmptestdir}/write_object_append_column_test.lh5") - array1 = lh5.Array(np.zeros(10)) - tb1 = lh5.Table(col_dict={"dset1`": lh5.Array(np.ones(10))}) - store = LH5Store() - store.write_object( - array1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5" - ) + array1 = types.Array(np.zeros(10)) + tb1 = types.Table(col_dict={"dset1`": types.Array(np.ones(10))}) + store = lh5.LH5Store() + store.write(array1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5") with pytest.raises(RuntimeError) as exc_info: - store.write_object( + store.write( tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5", @@ -855,18 +827,19 @@ def test_write_object_append_column(tmptestdir): if os.path.exists(f"{tmptestdir}/write_object_append_column_test.lh5"): os.remove(f"{tmptestdir}/write_object_append_column_test.lh5") - tb1 = lh5.Table( - col_dict={"dset1": lh5.Array(np.zeros(10)), "dset2": lh5.Array(np.zeros(10))} + tb1 = types.Table( + col_dict={ + "dset1": types.Array(np.zeros(10)), + "dset2": types.Array(np.zeros(10)), + } ) - tb2 = lh5.Table( - col_dict={"dset2": lh5.Array(np.ones(10))} + tb2 = types.Table( + col_dict={"dset2": types.Array(np.ones(10))} ) # Same field name, different values - store = LH5Store() - store.write_object( - tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5" - ) + store = lh5.LH5Store() + store.write(tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5") with pytest.raises(ValueError) as exc_info: - store.write_object( + store.write( tb2, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5", @@ 
-883,16 +856,14 @@ def test_write_object_append_column(tmptestdir): if os.path.exists(f"{tmptestdir}/write_object_append_column_test.lh5"): os.remove(f"{tmptestdir}/write_object_append_column_test.lh5") - tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))}) - tb2 = lh5.Table( - col_dict={"dset2": lh5.Array(np.ones(20))} + tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))}) + tb2 = types.Table( + col_dict={"dset2": types.Array(np.ones(20))} ) # different field name, different size - store = LH5Store() - store.write_object( - tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5" - ) + store = lh5.LH5Store() + store.write(tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5") with pytest.raises(ValueError) as exc_info: - store.write_object( + store.write( tb2, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5", @@ -909,18 +880,18 @@ def test_write_object_append_column(tmptestdir): if os.path.exists(f"{tmptestdir}/write_object_append_column_test.lh5"): os.remove(f"{tmptestdir}/write_object_append_column_test.lh5") - tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))}) - tb2 = lh5.Table( - col_dict={"dset2": lh5.Array(np.ones(10))} + tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))}) + tb2 = types.Table( + col_dict={"dset2": types.Array(np.ones(10))} ) # different field name, different size - store = LH5Store() - store.write_object( + store = lh5.LH5Store() + store.write( tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5", group="my_group", ) - store.write_object( + store.write( tb2, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5", @@ -929,9 +900,20 @@ def test_write_object_append_column(tmptestdir): ) # Now, check that the data were appended - tb_dat, _ = store.read_object( + tb_dat, _ = store.read( "my_group/my_table", f"{tmptestdir}/write_object_append_column_test.lh5" ) - assert isinstance(tb_dat, lgdo.Table) + assert isinstance(tb_dat, types.Table) assert np.array_equal(tb_dat["dset1"].nda, np.zeros(10)) assert np.array_equal(tb_dat["dset2"].nda, np.ones(10)) + + +def test_load_dfs(lgnd_file): + dfs = lh5.load_dfs( + [lgnd_file, lgnd_file], + ["baseline", "waveform/t0"], + lh5_group="/geds/raw", + idx_list=[[1, 3, 5], [2, 6, 7]], + ) + + assert isinstance(dfs, pd.DataFrame) diff --git a/tests/lh5/test_lh5_utils.py b/tests/lh5/test_lh5_utils.py new file mode 100644 index 00000000..c83dd9a9 --- /dev/null +++ b/tests/lh5/test_lh5_utils.py @@ -0,0 +1,72 @@ +import os + +import pytest + +import lgdo.lh5.utils as utils + + +@pytest.fixture(scope="module") +def lgnd_file(lgnd_test_data): + return lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5") + + +def test_parse_datatype(): + datatypes = [ + ("real", ("scalar", None, "real")), + ("array<1>{bool}", ("array", (1,), "bool")), + ("fixedsizearray<2>{real}", ("fixedsizearray", (2,), "real")), + ( + "arrayofequalsizedarrays<3,4>{complex}", + ("arrayofequalsizedarrays", (3, 4), "complex"), + ), + ("array<1>{array<1>{blob}}", ("array", (1,), "array<1>{blob}")), + ( + "struct{field1,field2,fieldn}", + ("struct", None, ["field1", "field2", "fieldn"]), + ), + ("table{col1,col2,coln}", ("table", None, ["col1", "col2", "coln"])), + ] + + for string, dt_tuple in datatypes: + pd_dt_tuple = utils.parse_datatype(string) + assert pd_dt_tuple == dt_tuple + + +def test_expand_vars(): + # Check env variable expansion + os.environ["PYGAMATESTBASEDIR"] = "a_random_string" + assert 
utils.expand_vars("$PYGAMATESTBASEDIR/blah") == "a_random_string/blah" + + # Check user variable expansion + assert ( + utils.expand_vars( + "$PYGAMATESTBASEDIR2/blah", + substitute={"PYGAMATESTBASEDIR2": "a_random_string"}, + ) + == "a_random_string/blah" + ) + + +def test_expand_path(lgnd_test_data): + files = [ + lgnd_test_data.get_path( + "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r001/l200-p03-r001-cal-20230318T012144Z-tier_dsp.lh5" + ), + lgnd_test_data.get_path( + "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r001/l200-p03-r001-cal-20230318T012228Z-tier_dsp.lh5" + ), + ] + base_dir = os.path.dirname(files[0]) + + assert utils.expand_path(f"{base_dir}/*20230318T012144Z*") == files[0] + + # Should fail if file not found + with pytest.raises(FileNotFoundError): + utils.expand_path(f"{base_dir}/not_a_real_file.lh5") + + # Should fail if multiple files found + with pytest.raises(FileNotFoundError): + utils.expand_path(f"{base_dir}/*.lh5") + + # Check if it finds a list of files correctly + assert sorted(utils.expand_path(f"{base_dir}/*.lh5", list=True)) == sorted(files) diff --git a/tests/test_lgdo_utils.py b/tests/test_lgdo_utils.py index 49df91ca..ce86d971 100644 --- a/tests/test_lgdo_utils.py +++ b/tests/test_lgdo_utils.py @@ -1,9 +1,6 @@ -import os - import numpy as np -import pytest -import lgdo.lgdo_utils as lgdo_utils +import lgdo.utils as utils def test_get_element_type(): @@ -20,69 +17,5 @@ def test_get_element_type(): ] for obj, name in objs: - get_name = lgdo_utils.get_element_type(obj) + get_name = utils.get_element_type(obj) assert get_name == name - - -def test_parse_datatype(): - datatypes = [ - ("real", ("scalar", None, "real")), - ("array<1>{bool}", ("array", (1,), "bool")), - ("fixedsizearray<2>{real}", ("fixedsizearray", (2,), "real")), - ( - "arrayofequalsizedarrays<3,4>{complex}", - ("arrayofequalsizedarrays", (3, 4), "complex"), - ), - ("array<1>{array<1>{blob}}", ("array", (1,), "array<1>{blob}")), - ( - "struct{field1,field2,fieldn}", - ("struct", None, ["field1", "field2", "fieldn"]), - ), - ("table{col1,col2,coln}", ("table", None, ["col1", "col2", "coln"])), - ] - - for string, dt_tuple in datatypes: - pd_dt_tuple = lgdo_utils.parse_datatype(string) - assert pd_dt_tuple == dt_tuple - - -def test_expand_vars(): - # Check env variable expansion - os.environ["PYGAMATESTBASEDIR"] = "a_random_string" - assert lgdo_utils.expand_vars("$PYGAMATESTBASEDIR/blah") == "a_random_string/blah" - - # Check user variable expansion - assert ( - lgdo_utils.expand_vars( - "$PYGAMATESTBASEDIR2/blah", - substitute={"PYGAMATESTBASEDIR2": "a_random_string"}, - ) - == "a_random_string/blah" - ) - - -def test_expand_path(lgnd_test_data): - files = [ - lgnd_test_data.get_path( - "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r001/l200-p03-r001-cal-20230318T012144Z-tier_dsp.lh5" - ), - lgnd_test_data.get_path( - "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r001/l200-p03-r001-cal-20230318T012228Z-tier_dsp.lh5" - ), - ] - base_dir = os.path.dirname(files[0]) - - assert lgdo_utils.expand_path(f"{base_dir}/*20230318T012144Z*") == files[0] - - # Should fail if file not found - with pytest.raises(FileNotFoundError): - lgdo_utils.expand_path(f"{base_dir}/not_a_real_file.lh5") - - # Should fail if multiple files found - with pytest.raises(FileNotFoundError): - lgdo_utils.expand_path(f"{base_dir}/*.lh5") - - # Check if it finds a list of files correctly - assert sorted(lgdo_utils.expand_path(f"{base_dir}/*.lh5", list=True)) == sorted( - files - ) diff --git a/tests/types/test_array.py 
b/tests/types/test_array.py index 0932c99b..df1bcd3c 100644 --- a/tests/types/test_array.py +++ b/tests/types/test_array.py @@ -1,6 +1,6 @@ import numpy as np -import lgdo.lgdo_utils as utils +import lgdo.utils as utils from lgdo import Array diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index 4126d119..71c20ea8 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -2,7 +2,7 @@ import pytest import lgdo -import lgdo.lgdo_utils as utils +import lgdo.utils as utils from lgdo import VectorOfVectors from lgdo.types import vectorofvectors as vov
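Reviewer note: the new `tests/lh5/test_lh5_utils.py` covers the helpers that moved out of the old `lgdo.lgdo_utils` module into `lgdo.lh5.utils`. A short sketch of the relocated helpers as the tests call them; the `$MY_DATA_DIR` variable and the wildcard path are illustrative placeholders:

```python
import lgdo.lh5.utils as utils

# LH5 datatype strings decode into a (kind, shape, element-type-or-fields) tuple
assert utils.parse_datatype("array<1>{bool}") == ("array", (1,), "bool")
assert utils.parse_datatype("table{col1,col2}") == ("table", None, ["col1", "col2"])

# environment or user-supplied variables can be expanded inside paths
assert (
    utils.expand_vars("$MY_DATA_DIR/file.lh5", substitute={"MY_DATA_DIR": "/data"})
    == "/data/file.lh5"
)

# expand_path() resolves a wildcard to exactly one file and raises
# FileNotFoundError on zero or multiple matches; pass list=True to get
# every match back as a list instead
# files = utils.expand_path("/data/*.lh5", list=True)
```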
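The generic element-type helper stays behind as `lgdo.utils.get_element_type` (hence the simplified imports in `tests/types/`), and `test_load_dfs` exercises the convenience reader that stacks columns from several files into a pandas DataFrame. A hedged sketch, with `raw.lh5` standing in for a real raw-tier file and the column names taken from that test:

```python
import numpy as np

import lgdo.lh5 as lh5
import lgdo.utils as utils  # formerly lgdo.lgdo_utils

# map a numpy scalar or array to its LH5 element-type name
# (a floating-point value is expected to map to "real")
print(utils.get_element_type(np.float64(1.0)))

# read selected columns from one or more LH5 files into a single DataFrame,
# optionally restricting each file to a list of row indices;
# "raw.lh5" is a placeholder for a file with a /geds/raw table
df = lh5.load_dfs(
    ["raw.lh5", "raw.lh5"],
    ["baseline", "waveform/t0"],
    lh5_group="/geds/raw",
    idx_list=[[1, 3, 5], [2, 6, 7]],
)
print(df.describe())
```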