diff --git a/docs/source/extensions/numbadoc.py b/docs/source/extensions/numbadoc.py
index 5b4202a1..06de3418 100644
--- a/docs/source/extensions/numbadoc.py
+++ b/docs/source/extensions/numbadoc.py
@@ -27,7 +27,7 @@ def import_object(self) -> bool:
"""
success = super().import_object()
if success:
- # Store away numba wrapper
+ # store away numba wrapper
self.jitobj = self.object
# And bend references to underlying python function
if hasattr(self.object, "py_func"):
diff --git a/docs/source/notebooks/DataCompression.ipynb b/docs/source/notebooks/DataCompression.ipynb
index fad9c9bc..74a26c92 100644
--- a/docs/source/notebooks/DataCompression.ipynb
+++ b/docs/source/notebooks/DataCompression.ipynb
@@ -61,8 +61,8 @@
"metadata": {},
"outputs": [],
"source": [
- "store = lgdo.LH5Store()\n",
- "store.write_object(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n",
+ "store = lgdo.lh5.LH5Store()\n",
+ "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n",
"lgdo.show(\"data.lh5\")"
]
},
@@ -110,7 +110,7 @@
"metadata": {},
"outputs": [],
"source": [
- "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS"
+ "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS"
]
},
{
@@ -131,18 +131,18 @@
"outputs": [],
"source": [
"# use another built-in filter\n",
- "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS = {\"compression\": \"lzf\"}\n",
+ "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {\"compression\": \"lzf\"}\n",
"\n",
"# specify filter name and options\n",
- "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS = {\"compression\": \"gzip\", \"compression_opts\": 7}\n",
+ "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {\"compression\": \"gzip\", \"compression_opts\": 7}\n",
"\n",
"# specify a registered filter provided by hdf5plugin\n",
"import hdf5plugin\n",
"\n",
- "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS = {\"compression\": hdf5plugin.Blosc()}\n",
+ "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {\"compression\": hdf5plugin.Blosc()}\n",
"\n",
"# shuffle bytes before compressing (typically better compression ratio with no performance penalty)\n",
- "lgdo.lh5_store.DEFAULT_HDF5_SETTINGS = {\"shuffle\": True, \"compression\": \"lzf\"}"
+ "lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {\"shuffle\": True, \"compression\": \"lzf\"}"
]
},
{
@@ -166,7 +166,7 @@
"metadata": {},
"outputs": [],
"source": [
- "store.write_object(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n",
+ "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n",
"show_h5ds_opts(\"data/col1\")"
]
},
@@ -175,7 +175,7 @@
"id": "f597a9e2",
"metadata": {},
"source": [
- "Nice. Shuffling bytes before compressing significantly reduced size on disk. Last but not least, `create_dataset()` keyword arguments can be passed to `write_object()`. They will be forwarded as is, overriding default settings."
+ "Nice. Shuffling bytes before compressing significantly reduced size on disk. Last but not least, `create_dataset()` keyword arguments can be passed to `write()`. They will be forwarded as is, overriding default settings."
]
},
{
@@ -185,9 +185,7 @@
"metadata": {},
"outputs": [],
"source": [
- "store.write_object(\n",
- " data, \"data\", \"data.lh5\", wo_mode=\"of\", shuffle=True, compression=\"gzip\"\n",
- ")\n",
+ "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\", shuffle=True, compression=\"gzip\")\n",
"show_h5ds_opts(\"data/col1\")"
]
},
@@ -207,7 +205,7 @@
"outputs": [],
"source": [
"data[\"col2\"].attrs[\"hdf5_settings\"] = {\"compression\": \"gzip\"}\n",
- "store.write_object(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n",
+ "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\")\n",
"\n",
"show_h5ds_opts(\"data/col1\")\n",
"show_h5ds_opts(\"data/col2\")"
@@ -221,7 +219,7 @@
"We are now storing table columns with different compression settings.\n",
"\n",
"
\n",
- "**Note:** since any [h5py.Group.create_dataset()](https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset) keyword argument can be used in `write_object()` or set in the `hdf5_settings` attribute, other HDF5 dataset settings can be configured, like the chunk size.\n",
+ "**Note:** since any [h5py.Group.create_dataset()](https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset) keyword argument can be used in `write()` or set in the `hdf5_settings` attribute, other HDF5 dataset settings can be configured, like the chunk size.\n",
"
"
]
},
@@ -232,7 +230,7 @@
"metadata": {},
"outputs": [],
"source": [
- "store.write_object(data, \"data\", \"data.lh5\", wo_mode=\"of\", chunks=2)"
+ "store.write(data, \"data\", \"data.lh5\", wo_mode=\"of\", chunks=2)"
]
},
{
@@ -257,7 +255,7 @@
"from legendtestdata import LegendTestData\n",
"\n",
"ldata = LegendTestData()\n",
- "wfs, n_rows = store.read_object(\n",
+ "wfs, n_rows = store.read(\n",
" \"geds/raw/waveform\",\n",
" ldata.get_path(\"lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5\"),\n",
")\n",
@@ -347,7 +345,7 @@
" t0=wfs.t0,\n",
" dt=wfs.dt,\n",
")\n",
- "store.write_object(enc_wfs, \"waveforms\", \"data.lh5\", wo_mode=\"o\")\n",
+ "store.write(enc_wfs, \"waveforms\", \"data.lh5\", wo_mode=\"o\")\n",
"lgdo.show(\"data.lh5\", attrs=True)"
]
},
@@ -372,7 +370,7 @@
"metadata": {},
"outputs": [],
"source": [
- "obj, _ = store.read_object(\"waveforms\", \"data.lh5\")\n",
+ "obj, _ = store.read(\"waveforms\", \"data.lh5\")\n",
"obj.values"
]
},
@@ -391,7 +389,7 @@
"metadata": {},
"outputs": [],
"source": [
- "obj, _ = store.read_object(\"waveforms\", \"data.lh5\", decompress=False)\n",
+ "obj, _ = store.read(\"waveforms\", \"data.lh5\", decompress=False)\n",
"obj.values"
]
},
@@ -433,9 +431,9 @@
"from lgdo.compression import ULEB128ZigZagDiff\n",
"\n",
"wfs.values.attrs[\"compression\"] = ULEB128ZigZagDiff()\n",
- "store.write_object(wfs, \"waveforms\", \"data.lh5\", wo_mode=\"of\")\n",
+ "store.write(wfs, \"waveforms\", \"data.lh5\", wo_mode=\"of\")\n",
"\n",
- "obj, _ = store.read_object(\"waveforms\", \"data.lh5\", decompress=False)\n",
+ "obj, _ = store.read(\"waveforms\", \"data.lh5\", decompress=False)\n",
"obj.values.attrs[\"codec\"]"
]
},
@@ -447,8 +445,8 @@
"Further reading:\n",
"\n",
"- [Available waveform compression algorithms](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.compression.html)\n",
- "- [read_object() docstring](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.LH5Store.read_object)\n",
- "- [write_object() docstring](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.LH5Store.write_object)"
+ "- [read() docstring](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5.store.LH5Store.read)\n",
+    "- [write() docstring](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5.store.LH5Store.write)"
]
}
],
diff --git a/docs/source/notebooks/LH5Files.ipynb b/docs/source/notebooks/LH5Files.ipynb
index 8563f4bd..9c594be9 100644
--- a/docs/source/notebooks/LH5Files.ipynb
+++ b/docs/source/notebooks/LH5Files.ipynb
@@ -38,7 +38,7 @@
"id": "c136b537",
"metadata": {},
"source": [
- "We can use `lgdo.lh5_store.ls()` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.ls) to inspect the file contents:"
+ "We can use `lgdo.lh5.ls()` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5.ls) to inspect the file contents:"
]
},
{
@@ -131,7 +131,7 @@
"metadata": {},
"outputs": [],
"source": [
- "store.read_object(\"geds/raw\", lh5_file)"
+ "store.read(\"geds/raw\", lh5_file)"
]
},
{
@@ -149,7 +149,7 @@
"metadata": {},
"outputs": [],
"source": [
- "obj, n_rows = store.read_object(\"geds/raw/timestamp\", lh5_file)\n",
+ "obj, n_rows = store.read(\"geds/raw/timestamp\", lh5_file)\n",
"obj"
]
},
@@ -170,7 +170,7 @@
"metadata": {},
"outputs": [],
"source": [
- "obj, n_rows = store.read_object(\"geds/raw/timestamp\", lh5_file, start_row=15, n_rows=10)\n",
+ "obj, n_rows = store.read(\"geds/raw/timestamp\", lh5_file, start_row=15, n_rows=10)\n",
"print(obj)"
]
},
@@ -189,7 +189,7 @@
"metadata": {},
"outputs": [],
"source": [
- "obj, n_rows = store.read_object(\n",
+ "obj, n_rows = store.read(\n",
" \"geds/raw\", lh5_file, field_mask=(\"timestamp\", \"energy\"), idx=[1, 3, 7, 9, 10, 15]\n",
")\n",
"print(obj)"
@@ -200,7 +200,7 @@
"id": "b3f52d77",
"metadata": {},
"source": [
- "As you might have noticed, `read_object()` loads all the requested data in memory at once. This can be a problem when dealing with large datasets. `LH5Iterator` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5_store.LH5Iterator) makes it possible to handle data one chunk at a time (sequentially) to avoid running out of memory:"
+    "As you might have noticed, `read()` loads all the requested data into memory at once. This can be a problem when dealing with large datasets. `LH5Iterator` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.html#lgdo.lh5.iterator.LH5Iterator) makes it possible to handle data one chunk at a time (sequentially) to avoid running out of memory:"
]
},
{
@@ -260,9 +260,7 @@
"source": [
"store = LH5Store()\n",
"\n",
- "store.write_object(\n",
- " scalar, name=\"message\", lh5_file=\"my_objects.lh5\", wo_mode=\"overwrite_file\"\n",
- ")"
+ "store.write(scalar, name=\"message\", lh5_file=\"my_objects.lh5\", wo_mode=\"overwrite_file\")"
]
},
{
@@ -300,10 +298,8 @@
"metadata": {},
"outputs": [],
"source": [
- "store.write_object(array, name=\"numbers\", group=\"closet\", lh5_file=\"my_objects.lh5\")\n",
- "store.write_object(\n",
- " wf_table, name=\"waveforms\", group=\"closet\", lh5_file=\"my_objects.lh5\"\n",
- ")\n",
+ "store.write(array, name=\"numbers\", group=\"closet\", lh5_file=\"my_objects.lh5\")\n",
+ "store.write(wf_table, name=\"waveforms\", group=\"closet\", lh5_file=\"my_objects.lh5\")\n",
"show(\"my_objects.lh5\")"
]
},
diff --git a/src/lgdo/__init__.py b/src/lgdo/__init__.py
index 5e6eb7e0..25efb08d 100644
--- a/src/lgdo/__init__.py
+++ b/src/lgdo/__init__.py
@@ -66,11 +66,11 @@
"VectorOfVectors",
"VectorOfEncodedVectors",
"WaveformTable",
- "LH5Iterator",
- "LH5Store",
"load_dfs",
"load_nda",
"ls",
"show",
+ "LH5Iterator",
+ "LH5Store",
"__version__",
]
diff --git a/src/lgdo/cli.py b/src/lgdo/cli.py
index 24ba56d1..2273579a 100644
--- a/src/lgdo/cli.py
+++ b/src/lgdo/cli.py
@@ -9,7 +9,7 @@
def lh5ls():
- """:func:`.show` command line interface."""
+ """:func:`.lh5.show` command line interface."""
parser = argparse.ArgumentParser(
prog="lh5ls", description="Inspect LEGEND HDF5 (LH5) file contents"
)
diff --git a/src/lgdo/lgdo_utils.py b/src/lgdo/lgdo_utils.py
index 05b46bd5..cddd2111 100644
--- a/src/lgdo/lgdo_utils.py
+++ b/src/lgdo/lgdo_utils.py
@@ -1,149 +1,56 @@
-"""Implements utilities for LEGEND Data Objects."""
from __future__ import annotations
-import glob
-import logging
-import os
-import string
+from warnings import warn
import numpy as np
from . import types as lgdo
+from .lh5 import utils
-log = logging.getLogger(__name__)
-
-def get_element_type(obj: object) -> str:
- """Get the LGDO element type of a scalar or array.
-
- For use in LGDO datatype attributes.
-
- Parameters
- ----------
- obj
- if a ``str``, will automatically return ``string`` if the object has
- a :class:`numpy.dtype`, that will be used for determining the element
- type otherwise will attempt to case the type of the object to a
- :class:`numpy.dtype`.
-
- Returns
- -------
- element_type
- A string stating the determined element type of the object.
- """
-
- # special handling for strings
- if isinstance(obj, str):
- return "string"
-
- # the rest use dtypes
- dt = obj.dtype if hasattr(obj, "dtype") else np.dtype(type(obj))
- kind = dt.kind
-
- if kind == "b":
- return "bool"
- if kind == "V":
- return "blob"
- if kind in ["i", "u", "f"]:
- return "real"
- if kind == "c":
- return "complex"
- if kind in ["S", "U"]:
- return "string"
-
- # couldn't figure it out
- raise ValueError(
- "cannot determine lgdo element_type for object of type", type(obj).__name__
+def copy(obj: lgdo.LGDO, dtype: np.dtype = None) -> lgdo.LGDO:
+ warn(
+        "lgdo_utils.copy will soon be removed and replaced with copy member functions of each LGDO data type.",
+ DeprecationWarning,
+ stacklevel=2,
)
+ return utils.copy(obj, dtype)
-def copy(obj: lgdo.LGDO, dtype: np.dtype = None) -> lgdo.LGDO:
- """Return a copy of an LGDO.
-
- Parameters
- ----------
- obj
- the LGDO to be copied.
- dtype
- NumPy dtype to be used for the copied object.
-
- """
- if dtype is None:
- dtype = obj.dtype
-
- if isinstance(obj, lgdo.Array):
- return lgdo.Array(
- np.array(obj.nda, dtype=dtype, copy=True), attrs=dict(obj.attrs)
- )
-
- if isinstance(obj, lgdo.VectorOfVectors):
- return lgdo.VectorOfVectors(
- flattened_data=copy(obj.flattened_data, dtype=dtype),
- cumulative_length=copy(obj.cumulative_length),
- attrs=dict(obj.attrs),
- )
-
- else:
- raise ValueError(f"copy of {type(obj)} not supported")
+def get_element_type(obj: object) -> str:
+ warn(
+ "'lgdo.lgdo_utils' has been renamed to 'lgdo.utils'. "
+ "Please replace either 'import lgdo.lgdo_utils as utils' with 'import lgdo.utils as utils' "
+        "or 'from lgdo.lgdo_utils import get_element_type' with 'from lgdo.utils import get_element_type'. "
+ "'lgdo.lgdo_utils' will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return utils.get_element_type(obj)
def parse_datatype(datatype: str) -> tuple[str, tuple[int, ...], str | list[str]]:
- """Parse datatype string and return type, dimensions and elements.
-
- Parameters
- ----------
- datatype
- a LGDO-formatted datatype string.
-
- Returns
- -------
- element_type
- the datatype name dims if not ``None``, a tuple of dimensions for the
- LGDO. Note this is not the same as the NumPy shape of the underlying
- data object. See the LGDO specification for more information. Also see
- :class:`~.types.ArrayOfEqualSizedArrays` and
- :meth:`.lh5_store.LH5Store.read_object` for example code elements for
- numeric objects, the element type for struct-like objects, the list of
- fields in the struct.
- """
- if "{" not in datatype:
- return "scalar", None, datatype
-
- # for other datatypes, need to parse the datatype string
- from parse import parse
-
- datatype, element_description = parse("{}{{{}}}", datatype)
- if datatype.endswith(">"):
- datatype, dims = parse("{}<{}>", datatype)
- dims = [int(i) for i in dims.split(",")]
- return datatype, tuple(dims), element_description
- else:
- return datatype, None, element_description.split(",")
+ warn(
+ "'lgdo.lgdo_utils' has been renamed to 'lgdo.utils'. "
+ "Please replace either 'import lgdo.lgdo_utils as utils' with 'import lgdo.utils as utils' "
+        "or 'from lgdo.lgdo_utils import parse_datatype' with 'from lgdo.utils import parse_datatype'. "
+ "'lgdo.lgdo_utils' will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return utils.parse_datatype(datatype)
def expand_vars(expr: str, substitute: dict[str, str] = None) -> str:
- """Expand (environment) variables.
-
- Note
- ----
- Malformed variable names and references to non-existing variables are left
- unchanged.
-
- Parameters
- ----------
- expr
- string expression, which may include (environment) variables prefixed by
- ``$``.
- substitute
- use this dictionary to substitute variables. Takes precedence over
- environment variables.
- """
- if substitute is None:
- substitute = {}
-
- # use provided mapping
- # then expand env variables
- return os.path.expandvars(string.Template(expr).safe_substitute(substitute))
+ warn(
+ "'lgdo.lgdo_utils' has been renamed to 'lgdo.utils'. "
+ "Please replace either 'import lgdo.lgdo_utils as utils' with 'import lgdo.utils as utils' "
+        "or 'from lgdo.lgdo_utils import expand_vars' with 'from lgdo.utils import expand_vars'. "
+ "'lgdo.lgdo_utils' will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return utils.expand_vars(expr, substitute)
def expand_path(
@@ -152,45 +59,12 @@ def expand_path(
list: bool = False,
base_path: str = None,
) -> str | list:
- """Expand (environment) variables and wildcards to return absolute paths.
-
- Parameters
- ----------
- path
- name of path, which may include environment variables and wildcards.
- list
- if ``True``, return a list. If ``False``, return a string; if ``False``
- and a unique file is not found, raise an exception.
- substitute
- use this dictionary to substitute variables. Environment variables take
- precedence.
- base_path
- name of base path. Returned paths will be relative to base.
-
- Returns
- -------
- path or list of paths
- Unique absolute path, or list of all absolute paths
- """
- if base_path is not None and base_path != "":
- base_path = os.path.expanduser(os.path.expandvars(base_path))
- path = os.path.join(base_path, path)
-
- # first expand variables
- _path = expand_vars(path, substitute)
-
- # then expand wildcards
- paths = sorted(glob.glob(os.path.expanduser(_path)))
-
- if base_path is not None and base_path != "":
- paths = [os.path.relpath(p, base_path) for p in paths]
-
- if not list:
- if len(paths) == 0:
- raise FileNotFoundError(f"could not find path matching {path}")
- elif len(paths) > 1:
- raise FileNotFoundError(f"found multiple paths matching {path}")
- else:
- return paths[0]
- else:
- return paths
+ warn(
+ "'lgdo.lgdo_utils' has been renamed to 'lgdo.utils'. "
+ "Please replace either 'import lgdo.lgdo_utils as utils' with 'import lgdo.utils as utils' "
+        "or 'from lgdo.lgdo_utils import expand_path' with 'from lgdo.utils import expand_path'. "
+ "'lgdo.lgdo_utils' will be removed in a future release. ",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return utils.expand_path(path, substitute, list, base_path)
diff --git a/src/lgdo/lh5/__init__.py b/src/lgdo/lh5/__init__.py
new file mode 100644
index 00000000..6263372a
--- /dev/null
+++ b/src/lgdo/lh5/__init__.py
@@ -0,0 +1,18 @@
+"""Routines from reading and writing LEGEND Data Objects in HDF5 files.
+Currently the primary on-disk format for LGDO object is LEGEND HDF5 (LH5) files. IO
+is done via the class :class:`.store.LH5Store`. LH5 files can also be
+browsed easily in python like any `HDF5 `_ file using
+`h5py `_.
+"""
+
+from .iterator import LH5Iterator
+from .store import LH5Store, load_dfs, load_nda, ls, show
+
+__all__ = [
+ "LH5Iterator",
+ "LH5Store",
+ "load_dfs",
+ "load_nda",
+ "ls",
+ "show",
+]
diff --git a/src/lgdo/lh5/iterator.py b/src/lgdo/lh5/iterator.py
new file mode 100644
index 00000000..534a7c05
--- /dev/null
+++ b/src/lgdo/lh5/iterator.py
@@ -0,0 +1,310 @@
+from __future__ import annotations
+
+import logging
+import typing
+
+import numpy as np
+import pandas as pd
+
+from ..types import Array, Scalar, Struct, VectorOfVectors
+from .store import LH5Store
+from .utils import expand_path
+
+LGDO = typing.Union[Array, Scalar, Struct, VectorOfVectors]
+
+
+class LH5Iterator(typing.Iterator):
+ """
+ A class for iterating through one or more LH5 files, one block of entries
+ at a time. This also accepts an entry list/mask to enable event selection,
+ and a field mask.
+
+ This class can be used either for random access:
+
+ >>> lh5_obj, n_rows = lh5_it.read(entry)
+
+ to read the block of entries starting at entry. In case of multiple files
+ or the use of an event selection, entry refers to a global event index
+ across files and does not count events that are excluded by the selection.
+
+ This can also be used as an iterator:
+
+ >>> for lh5_obj, entry, n_rows in LH5Iterator(...):
+ >>> # do the thing!
+
+    This is intended for when you are reading a large quantity of data but
+ want to limit your memory usage (particularly when reading in waveforms!).
+ The ``lh5_obj`` that is read by this class is reused in order to avoid
+ reallocation of memory; this means that if you want to hold on to data
+ between reads, you will have to copy it somewhere!
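+
+    A minimal sketch of that copy-out pattern (the group and field names here
+    are purely illustrative):
+
+    >>> energies = []
+    >>> for lh5_obj, entry, n_rows in LH5Iterator("file.lh5", "geds/raw"):
+    >>>     energies.append(np.array(lh5_obj["energy"].nda[:n_rows]))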
+ """
+
+ def __init__(
+ self,
+ lh5_files: str | list[str],
+ groups: str | list[str],
+ base_path: str = "",
+ entry_list: list[int] | list[list[int]] = None,
+ entry_mask: list[bool] | list[list[bool]] = None,
+ field_mask: dict[str, bool] | list[str] | tuple[str] = None,
+ buffer_len: int = 3200,
+ friend: typing.Iterator = None,
+ ) -> None:
+ """
+ Parameters
+ ----------
+ lh5_files
+ file or files to read from. May include wildcards and environment
+ variables.
+ groups
+ HDF5 group(s) to read. If a list is provided for both lh5_files
+ and group, they must be the same size. If a file is wild-carded,
+            the same group will be assigned to each file found.
+ entry_list
+ list of entry numbers to read. If a nested list is provided,
+ expect one top-level list for each file, containing a list of
+ local entries. If a list of ints is provided, use global entries.
+ entry_mask
+ mask of entries to read. If a list of arrays is provided, expect
+ one for each file. Ignore if a selection list is provided.
+ field_mask
+ mask of which fields to read. See :meth:`LH5Store.read` for
+ more details.
+ buffer_len
+ number of entries to read at a time while iterating through files.
+ friend
+            a "friend" LH5Iterator that will be read in parallel with this.
+ The friend should have the same length and entry list. A single
+ LH5 table containing columns from both iterators will be returned.
+ """
+ self.lh5_st = LH5Store(base_path=base_path, keep_open=True)
+
+ # List of files, with wildcards and env vars expanded
+ if isinstance(lh5_files, str):
+ lh5_files = [lh5_files]
+ if isinstance(groups, list):
+ lh5_files *= len(groups)
+ elif not isinstance(lh5_files, list):
+ raise ValueError("lh5_files must be a string or list of strings")
+
+ if isinstance(groups, str):
+ groups = [groups] * len(lh5_files)
+ elif not isinstance(groups, list):
+ raise ValueError("group must be a string or list of strings")
+
+ if not len(groups) == len(lh5_files):
+ raise ValueError("lh5_files and groups must have same length")
+
+ self.lh5_files = []
+ self.groups = []
+ for f, g in zip(lh5_files, groups):
+ f_exp = expand_path(f, list=True, base_path=base_path)
+ self.lh5_files += f_exp
+ self.groups += [g] * len(f_exp)
+
+ if entry_list is not None and entry_mask is not None:
+ raise ValueError(
+ "entry_list and entry_mask arguments are mutually exclusive"
+ )
+
+ # Map to last row in each file
+ self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
+ # Map to last iterator entry for each file
+ self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
+ self.buffer_len = buffer_len
+
+ if len(self.lh5_files) > 0:
+ f = self.lh5_files[0]
+ g = self.groups[0]
+ self.lh5_buffer = self.lh5_st.get_buffer(
+ g,
+ f,
+ size=self.buffer_len,
+ field_mask=field_mask,
+ )
+ self.file_map[0] = self.lh5_st.read_n_rows(g, f)
+ else:
+ raise RuntimeError(f"can't open any files from {lh5_files}")
+
+ self.n_rows = 0
+ self.current_entry = 0
+ self.next_entry = 0
+
+ self.field_mask = field_mask
+
+ # List of entry indices from each file
+ self.local_entry_list = None
+ self.global_entry_list = None
+ if entry_list is not None:
+ entry_list = list(entry_list)
+ if isinstance(entry_list[0], int):
+ self.local_entry_list = [None] * len(self.file_map)
+ self.global_entry_list = np.array(entry_list, "i")
+ self.global_entry_list.sort()
+
+ else:
+ self.local_entry_list = [[]] * len(self.file_map)
+ for i_file, local_list in enumerate(entry_list):
+ self.local_entry_list[i_file] = np.array(local_list, "i")
+ self.local_entry_list[i_file].sort()
+
+ elif entry_mask is not None:
+ # Convert entry mask into an entry list
+ if isinstance(entry_mask, pd.Series):
+ entry_mask = entry_mask.values
+ if isinstance(entry_mask, np.ndarray):
+ self.local_entry_list = [None] * len(self.file_map)
+ self.global_entry_list = np.nonzero(entry_mask)[0]
+ else:
+ self.local_entry_list = [[]] * len(self.file_map)
+ for i_file, local_mask in enumerate(entry_mask):
+ self.local_entry_list[i_file] = np.nonzero(local_mask)[0]
+
+ # Attach the friend
+ if friend is not None:
+ if not isinstance(friend, typing.Iterator):
+ raise ValueError("Friend must be an Iterator")
+ self.lh5_buffer.join(friend.lh5_buffer)
+ self.friend = friend
+
+ def _get_file_cumlen(self, i_file: int) -> int:
+ """Helper to get cumulative file length of file"""
+ if i_file < 0:
+ return 0
+ fcl = self.file_map[i_file]
+ if fcl == np.iinfo("i").max:
+ fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows(
+ self.groups[i_file], self.lh5_files[i_file]
+ )
+ self.file_map[i_file] = fcl
+ return fcl
+
+ def _get_file_cumentries(self, i_file: int) -> int:
+ """Helper to get cumulative iterator entries in file"""
+ if i_file < 0:
+ return 0
+ n = self.entry_map[i_file]
+ if n == np.iinfo("i").max:
+ elist = self.get_file_entrylist(i_file)
+ fcl = self._get_file_cumlen(i_file)
+ if elist is None:
+ # no entry list provided
+ n = fcl
+ else:
+ file_entries = self.get_file_entrylist(i_file)
+ n = len(file_entries)
+ # check that file entries fall inside of file
+ if n > 0 and file_entries[-1] >= fcl:
+ logging.warning(f"Found entries out of range for file {i_file}")
+ n = np.searchsorted(file_entries, fcl, "right")
+ n += self._get_file_cumentries(i_file - 1)
+ self.entry_map[i_file] = n
+ return n
+
+ def get_file_entrylist(self, i_file: int) -> np.ndarray:
+ """Helper to get entry list for file"""
+ # If no entry list is provided
+ if self.local_entry_list is None:
+ return None
+
+ elist = self.local_entry_list[i_file]
+ if elist is None:
+ # Get local entrylist for this file from global entry list
+ f_start = self._get_file_cumlen(i_file - 1)
+ f_end = self._get_file_cumlen(i_file)
+ i_start = self._get_file_cumentries(i_file - 1)
+ i_stop = np.searchsorted(self.global_entry_list, f_end, "right")
+ elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start
+ self.local_entry_list[i_file] = elist
+ return elist
+
+ def get_global_entrylist(self) -> np.ndarray:
+ """Get global entry list, constructing it if needed"""
+ if self.global_entry_list is None and self.local_entry_list is not None:
+ self.global_entry_list = np.zeros(len(self), "i")
+ for i_file in range(len(self.lh5_files)):
+                i_start = self._get_file_cumentries(i_file - 1)
+                i_stop = self._get_file_cumentries(i_file)
+                f_start = self._get_file_cumlen(i_file - 1)
+ self.global_entry_list[i_start:i_stop] = (
+ self.get_file_entrylist(i_file) + f_start
+ )
+ return self.global_entry_list
+
+ def read(self, entry: int) -> tuple[LGDO, int]:
+        """Read the next local chunk of events, starting at entry. Return the
+ LH5 buffer and number of rows read."""
+ self.n_rows = 0
+ i_file = np.searchsorted(self.entry_map, entry, "right")
+
+ # if file hasn't been opened yet, search through files
+ # sequentially until we find the right one
+ if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max:
+ while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries(
+ i_file
+ ):
+ i_file += 1
+
+ if i_file == len(self.lh5_files):
+ return (self.lh5_buffer, self.n_rows)
+ local_entry = entry - self._get_file_cumentries(i_file - 1)
+
+ while self.n_rows < self.buffer_len and i_file < len(self.file_map):
+ # Loop through files
+ local_idx = self.get_file_entrylist(i_file)
+ if local_idx is not None and len(local_idx) == 0:
+ i_file += 1
+ local_entry = 0
+ continue
+
+ i_local = local_idx[local_entry] if local_idx is not None else local_entry
+ self.lh5_buffer, n_rows = self.lh5_st.read(
+ self.groups[i_file],
+ self.lh5_files[i_file],
+ start_row=i_local,
+ n_rows=self.buffer_len - self.n_rows,
+ idx=local_idx,
+ field_mask=self.field_mask,
+ obj_buf=self.lh5_buffer,
+ obj_buf_start=self.n_rows,
+ )
+
+ self.n_rows += n_rows
+ i_file += 1
+ local_entry = 0
+
+ self.current_entry = entry
+
+ if self.friend is not None:
+ self.friend.read(entry)
+
+ return (self.lh5_buffer, self.n_rows)
+
+ def reset_field_mask(self, mask):
+ """Replaces the field mask of this iterator and any friends with mask"""
+ self.field_mask = mask
+ if self.friend is not None:
+ self.friend.reset_field_mask(mask)
+
+ def __len__(self) -> int:
+ """Return the total number of entries."""
+ return (
+ self._get_file_cumentries(len(self.lh5_files) - 1)
+ if len(self.entry_map) > 0
+ else 0
+ )
+
+ def __iter__(self) -> typing.Iterator:
+ """Loop through entries in blocks of size buffer_len."""
+ self.current_entry = 0
+ self.next_entry = 0
+ return self
+
+ def __next__(self) -> tuple[LGDO, int, int]:
+ """Read next buffer_len entries and return lh5_table, iterator entry
+ and n_rows read."""
+ buf, n_rows = self.read(self.next_entry)
+ self.next_entry = self.current_entry + n_rows
+ if n_rows == 0:
+ raise StopIteration
+ return (buf, self.current_entry, n_rows)
diff --git a/src/lgdo/lh5/store.py b/src/lgdo/lh5/store.py
new file mode 100644
index 00000000..3c2aa696
--- /dev/null
+++ b/src/lgdo/lh5/store.py
@@ -0,0 +1,1535 @@
+"""
+This module implements routines for reading and writing LEGEND Data Objects in
+HDF5 files.
+"""
+from __future__ import annotations
+
+import fnmatch
+import glob
+import logging
+import os
+import sys
+from bisect import bisect_left
+from collections import defaultdict
+from typing import Any, Union
+
+import h5py
+import numba as nb
+import numpy as np
+import pandas as pd
+
+from .. import compression as compress
+from ..compression import WaveformCodec
+from ..types import (
+ Array,
+ ArrayOfEncodedEqualSizedArrays,
+ ArrayOfEqualSizedArrays,
+ FixedSizeArray,
+ Scalar,
+ Struct,
+ Table,
+ VectorOfEncodedVectors,
+ VectorOfVectors,
+ WaveformTable,
+)
+from .utils import expand_path, parse_datatype
+
+LGDO = Union[Array, Scalar, Struct, VectorOfVectors]
+
+log = logging.getLogger(__name__)
+
+DEFAULT_HDF5_SETTINGS: dict[str, Any] = {"shuffle": True, "compression": "gzip"}
+DEFAULT_HDF5_COMPRESSION = None
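+# The write defaults above can be overridden globally, as done for instance in
+# the DataCompression tutorial notebook, e.g.:
+#
+#   lgdo.lh5.store.DEFAULT_HDF5_SETTINGS = {"shuffle": True, "compression": "lzf"}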
+
+
+class LH5Store:
+ """
+ Class to represent a store of LEGEND HDF5 files. The two main methods
+ implemented by the class are :meth:`read` and :meth:`write`.
+
+ Examples
+ --------
+ >>> from lgdo import LH5Store
+ >>> store = LH5Store()
+ >>> obj, _ = store.read("/geds/waveform", "file.lh5")
+ >>> type(obj)
+ lgdo.waveform_table.WaveformTable
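+
+    Writing it back out (the output file name here is just illustrative):
+
+    >>> store.write(obj, "waveform", "out.lh5", wo_mode="of")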
+ """
+
+ def __init__(self, base_path: str = "", keep_open: bool = False) -> None:
+ """
+ Parameters
+ ----------
+ base_path
+ directory path to prepend to LH5 files.
+ keep_open
+ whether to keep files open by storing the :mod:`h5py` objects as
+ class attributes.
+ """
+ self.base_path = "" if base_path == "" else expand_path(base_path)
+ self.keep_open = keep_open
+ self.files = {}
+
+ def gimme_file(self, lh5_file: str | h5py.File, mode: str = "r") -> h5py.File:
+ """Returns a :mod:`h5py` file object from the store or creates a new one.
+
+ Parameters
+ ----------
+ lh5_file
+ LH5 file name.
+ mode
+ mode in which to open file. See :class:`h5py.File` documentation.
+ """
+ if isinstance(lh5_file, h5py.File):
+ return lh5_file
+ if mode == "r":
+ lh5_file = expand_path(lh5_file, base_path=self.base_path)
+ if lh5_file in self.files.keys():
+ return self.files[lh5_file]
+ if self.base_path != "":
+ full_path = os.path.join(self.base_path, lh5_file)
+ else:
+ full_path = lh5_file
+ if mode != "r":
+ directory = os.path.dirname(full_path)
+ if directory != "" and not os.path.exists(directory):
+ log.debug(f"making path {directory}")
+ os.makedirs(directory)
+ if mode == "r" and not os.path.exists(full_path):
+ raise FileNotFoundError(f"file {full_path} not found")
+ if mode != "r" and os.path.exists(full_path):
+ log.debug(f"opening existing file {full_path} in mode '{mode}'")
+ h5f = h5py.File(full_path, mode)
+ if self.keep_open:
+ self.files[lh5_file] = h5f
+ return h5f
+
+ def gimme_group(
+ self,
+ group: str | h5py.Group,
+ base_group: h5py.Group,
+ grp_attrs: dict[str, Any] = None,
+ overwrite: bool = False,
+ ) -> h5py.Group:
+ """
+ Returns an existing :class:`h5py` group from a base group or creates a
+ new one. Can also set (or replace) group attributes.
+
+ Parameters
+ ----------
+ group
+ name of the HDF5 group.
+ base_group
+ HDF5 group to be used as a base.
+ grp_attrs
+ HDF5 group attributes.
+ overwrite
+ whether overwrite group attributes, ignored if `grp_attrs` is
+ ``None``.
+ """
+ if not isinstance(group, h5py.Group):
+ if group in base_group:
+ group = base_group[group]
+ else:
+ group = base_group.create_group(group)
+ if grp_attrs is not None:
+ group.attrs.update(grp_attrs)
+ return group
+ if (
+ grp_attrs is not None
+ and len(set(grp_attrs.items()) ^ set(group.attrs.items())) > 0
+ ):
+ if not overwrite:
+ raise RuntimeError("grp_attrs != group.attrs but overwrite not set")
+ else:
+ log.debug(f"overwriting {group}.attrs...")
+ for key in group.attrs.keys():
+ group.attrs.pop(key)
+ group.attrs.update(grp_attrs)
+ return group
+
+ def get_buffer(
+ self,
+ name: str,
+ lh5_file: str | h5py.File | list[str | h5py.File],
+ size: int = None,
+ field_mask: dict[str, bool] | list[str] | tuple[str] = None,
+ ) -> LGDO:
+ """Returns an LH5 object appropriate for use as a pre-allocated buffer
+ in a read loop. Sets size to `size` if object has a size.
+ """
+ obj, n_rows = self.read(name, lh5_file, n_rows=0, field_mask=field_mask)
+ if hasattr(obj, "resize") and size is not None:
+ obj.resize(new_size=size)
+ return obj
+
+ def read(
+ self,
+ name: str,
+ lh5_file: str | h5py.File | list[str | h5py.File],
+ start_row: int = 0,
+ n_rows: int = sys.maxsize,
+ idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
+ use_h5idx: bool = False,
+ field_mask: dict[str, bool] | list[str] | tuple[str] = None,
+ obj_buf: LGDO = None,
+ obj_buf_start: int = 0,
+ decompress: bool = True,
+ ) -> tuple[LGDO, int]:
+ """Read LH5 object data from a file.
+
+ Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
+ controls whether *only* those rows are read from disk or if the rows are indexed after reading
+ the entire object. Reading individual rows can be orders of magnitude slower than reading
+ the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
+ is to use slightly more memory for a much faster read. See
+        `legend-pydataobj #29 <https://github.com/legend-exp/legend-pydataobj/issues/29>`_
+ for additional information.
+
+ Parameters
+ ----------
+ name
+ Name of the LH5 object to be read (including its group path).
+ lh5_file
+ The file(s) containing the object to be read out. If a list of
+ files, array-like object data will be concatenated into the output
+ object.
+ start_row
+ Starting entry for the object read (for array-like objects). For a
+ list of files, only applies to the first file.
+ n_rows
+ The maximum number of rows to read (for array-like objects). The
+ actual number of rows read will be returned as one of the return
+ values (see below).
+ idx
+            For NumPy-style "fancy indexing" for the read to select only some
+ rows, e.g. after applying some cuts to particular columns.
+ Only selection along the first axis is supported, so tuple arguments
+ must be one-tuples. If `n_rows` is not false, `idx` will be truncated to
+ `n_rows` before reading. To use with a list of files, can pass in a list of
+ `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
+ identical read). If used in conjunction with `start_row` and `n_rows`,
+ will be sliced to obey those constraints, where `n_rows` is
+ interpreted as the (max) number of *selected* values (in `idx`) to be
+ read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
+ read and that the default behavior (``use_h5idx=False``) prioritizes speed over
+ a small memory penalty.
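+            For instance (an illustrative value), ``idx=[1, 3, 7]`` would read
+            out only rows 1, 3 and 7 of the requested array-like object.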
+ use_h5idx
+ ``True`` will directly pass the ``idx`` parameter to the underlying
+ ``h5py`` call such that only the selected rows are read directly into memory,
+ which conserves memory at the cost of speed. There can be a significant penalty
+ to speed for larger files (1 - 2 orders of magnitude longer time).
+ ``False`` (default) will read the entire object into memory before
+ performing the indexing. The default is much faster but requires additional memory,
+ though a relatively small amount in the typical use case. It is recommended to
+ leave this parameter as its default.
+ field_mask
+            For tables and structs, determines which fields get read out.
+ Only applies to immediate fields of the requested objects. If a dict
+ is used, a default dict will be made with the default set to the
+ opposite of the first element in the dict. This way if one specifies
+ a few fields at ``False``, all but those fields will be read out,
+ while if one specifies just a few fields as ``True``, only those
+ fields will be read out. If a list is provided, the listed fields
+ will be set to ``True``, while the rest will default to ``False``.
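+            As an illustration, ``field_mask={"energy": False}`` reads out
+            all fields except ``energy``, while
+            ``field_mask=("energy", "timestamp")`` reads out only those two.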
+ obj_buf
+ Read directly into memory provided in `obj_buf`. Note: the buffer
+ will be expanded to accommodate the data requested. To maintain the
+ buffer length, send in ``n_rows = len(obj_buf)``.
+ obj_buf_start
+ Start location in ``obj_buf`` for read. For concatenating data to
+ array-like objects.
+ decompress
+ Decompress data encoded with LGDO's compression routines right
+ after reading. The option has no effect on data encoded with HDF5
+ built-in filters, which is always decompressed upstream by HDF5.
+
+
+ Returns
+ -------
+ (object, n_rows_read)
+            `object` is the read-out object, `n_rows_read` is the number of
+            rows successfully read out. Essential for arrays when the amount
+            of data is smaller than the object buffer. For scalars and structs
+            `n_rows_read` will be ``1``. For tables it is redundant with
+            ``table.loc``.
+ """
+ # Handle list-of-files recursively
+ if not isinstance(lh5_file, (str, h5py.File)):
+ lh5_file = list(lh5_file)
+ n_rows_read = 0
+
+ # to know whether we are reading in a list of files.
+ # this is part of the fix for reading data by idx
+ # (see https://github.com/legend-exp/legend-pydataobj/issues/29)
+ # so that we only make a copy of the data if absolutely necessary
+ # or if we can read the data from file without having to make a copy
+ self.in_file_loop = True
+
+ for i, h5f in enumerate(lh5_file):
+ if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
+ # a list of lists: must be one per file
+ idx_i = idx[i]
+ elif idx is not None:
+ # make idx a proper tuple if it's not one already
+ if not (isinstance(idx, tuple) and len(idx) == 1):
+ idx = (idx,)
+ # idx is a long continuous array
+ n_rows_i = self.read_n_rows(name, h5f)
+ # find the length of the subset of idx that contains indices
+ # that are less than n_rows_i
+ n_rows_to_read_i = bisect_left(idx[0], n_rows_i)
+ # now split idx into idx_i and the remainder
+ idx_i = (idx[0][:n_rows_to_read_i],)
+ idx = (idx[0][n_rows_to_read_i:] - n_rows_i,)
+ else:
+ idx_i = None
+ n_rows_i = n_rows - n_rows_read
+
+ # maybe someone passed in a list of len==1?
+ if i == (len(lh5_file) - 1):
+ self.in_file_loop = False
+
+ obj_buf, n_rows_read_i = self.read(
+ name,
+ lh5_file[i],
+ start_row=start_row,
+ n_rows=n_rows_i,
+ idx=idx_i,
+ use_h5idx=use_h5idx,
+ field_mask=field_mask,
+ obj_buf=obj_buf,
+ obj_buf_start=obj_buf_start,
+ decompress=decompress,
+ )
+
+ n_rows_read += n_rows_read_i
+ if n_rows_read >= n_rows or obj_buf is None:
+ return obj_buf, n_rows_read
+ start_row = 0
+ obj_buf_start += n_rows_read_i
+
+ self.in_file_loop = False
+
+ return obj_buf, n_rows_read
+
+ # get the file from the store
+ h5f = self.gimme_file(lh5_file, "r")
+ if not h5f or name not in h5f:
+ raise KeyError(f"'{name}' not in {h5f.filename}")
+
+ log.debug(
+ f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, "
+ + (f" with field mask {field_mask}" if field_mask else "")
+ )
+
+ # make idx a proper tuple if it's not one already
+ if not (isinstance(idx, tuple) and len(idx) == 1):
+ if idx is not None:
+ idx = (idx,)
+
+ # get the object's datatype
+ if "datatype" not in h5f[name].attrs:
+ raise RuntimeError(
+ f"'{name}' in file {lh5_file} is missing the datatype attribute"
+ )
+
+ datatype = h5f[name].attrs["datatype"]
+ datatype, shape, elements = parse_datatype(datatype)
+
+ # check field_mask and make it a default dict
+ if datatype == "struct" or datatype == "table":
+ if field_mask is None:
+ field_mask = defaultdict(lambda: True)
+ elif isinstance(field_mask, dict):
+ default = True
+ if len(field_mask) > 0:
+ default = not field_mask[list(field_mask.keys())[0]]
+ field_mask = defaultdict(lambda: default, field_mask)
+ elif isinstance(field_mask, (list, tuple)):
+ field_mask = defaultdict(
+ lambda: False, {field: True for field in field_mask}
+ )
+ elif not isinstance(field_mask, defaultdict):
+ raise RuntimeError("bad field_mask of type", type(field_mask).__name__)
+ elif field_mask is not None:
+ raise RuntimeError(f"datatype {datatype} does not accept a field_mask")
+
+ # Scalar
+ # scalars are dim-0 datasets
+ if datatype == "scalar":
+ value = h5f[name][()]
+ if elements == "bool":
+ value = np.bool_(value)
+ if obj_buf is not None:
+ obj_buf.value = value
+ obj_buf.attrs.update(h5f[name].attrs)
+ return obj_buf, 1
+ else:
+ return Scalar(value=value, attrs=h5f[name].attrs), 1
+
+ # Struct
+ # recursively build a struct, return as a dictionary
+ if datatype == "struct":
+ # ignore obj_buf.
+ # TODO: could append new fields or overwrite/concat to existing
+ # fields. If implemented, get_buffer() above should probably also
+ # (optionally?) prep buffers for each field
+ if obj_buf is not None:
+                raise NotImplementedError("obj_buf not implemented for LGDO Structs")
+
+ # loop over fields and read
+ obj_dict = {}
+ for field in elements:
+ if not field_mask[field]:
+ continue
+ # TODO: it's strange to pass start_row, n_rows, idx to struct
+ # fields. If they all had shared indexing, they should be in a
+ # table... Maybe should emit a warning? Or allow them to be
+ # dicts keyed by field name?
+ if "int_keys" in h5f[name].attrs:
+ if dict(h5f[name].attrs)["int_keys"]:
+ f = int(field)
+ else:
+ f = str(field)
+ obj_dict[f], _ = self.read(
+ name + "/" + field,
+ h5f,
+ start_row=start_row,
+ n_rows=n_rows,
+ idx=idx,
+ use_h5idx=use_h5idx,
+ decompress=decompress,
+ )
+ # modify datatype in attrs if a field_mask was used
+ attrs = dict(h5f[name].attrs)
+ if field_mask is not None:
+ selected_fields = []
+ for field in elements:
+ if field_mask[field]:
+ selected_fields.append(field)
+ attrs["datatype"] = "struct" + "{" + ",".join(selected_fields) + "}"
+ return Struct(obj_dict=obj_dict, attrs=attrs), 1
+
+ # Below here is all array-like types. So trim idx if needed
+ if idx is not None:
+ # chop off indices < start_row
+ i_first_valid = bisect_left(idx[0], start_row)
+ idxa = idx[0][i_first_valid:]
+ # don't readout more than n_rows indices
+ idx = (idxa[:n_rows],) # works even if n_rows > len(idxa)
+
+ # Table or WaveformTable
+ if datatype == "table":
+ col_dict = {}
+
+ # read out each of the fields
+ rows_read = []
+ for field in elements:
+ if not field_mask[field]:
+ continue
+
+ fld_buf = None
+ if obj_buf is not None:
+ if not isinstance(obj_buf, Table) or field not in obj_buf:
+ raise ValueError(
+ f"obj_buf for LGDO Table '{name}' not formatted correctly"
+ )
+
+ else:
+ fld_buf = obj_buf[field]
+
+ col_dict[field], n_rows_read = self.read(
+ name + "/" + field,
+ h5f,
+ start_row=start_row,
+ n_rows=n_rows,
+ idx=idx,
+ use_h5idx=use_h5idx,
+ obj_buf=fld_buf,
+ obj_buf_start=obj_buf_start,
+ decompress=decompress,
+ )
+ if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf):
+ obj_buf.resize(obj_buf_start + n_rows_read)
+
+ rows_read.append(n_rows_read)
+
+ # warn if all columns don't read in the same number of rows
+ if len(rows_read) > 0:
+ n_rows_read = rows_read[0]
+ else:
+ n_rows_read = 0
+ log.warning(f"Table '{name}' has no subgroups accepted by field mask")
+
+ for n in rows_read[1:]:
+ if n != n_rows_read:
+ log.warning(
+ f"Table '{name}' got strange n_rows_read = {n}, {n_rows_read} was expected ({rows_read})"
+ )
+
+ # modify datatype in attrs if a field_mask was used
+ attrs = dict(h5f[name].attrs)
+ if field_mask is not None:
+ selected_fields = []
+ for field in elements:
+ if field_mask[field]:
+ selected_fields.append(field)
+ attrs["datatype"] = "table" + "{" + ",".join(selected_fields) + "}"
+
+ # fields have been read out, now return a table
+ if obj_buf is None:
+ # if col_dict contains just 3 objects called t0, dt, and values,
+ # return a WaveformTable
+ if (
+ len(col_dict) == 3
+ and "t0" in col_dict
+ and "dt" in col_dict
+ and "values" in col_dict
+ ):
+ table = WaveformTable(
+ t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"]
+ )
+ else:
+ table = Table(col_dict=col_dict, attrs=attrs)
+
+ # set (write) loc to end of tree
+ table.loc = n_rows_read
+ return table, n_rows_read
+ else:
+ # We have read all fields into the object buffer. Run
+ # checks: All columns should be the same size. So update
+ # table's size as necessary, warn if any mismatches are found
+ obj_buf.resize(do_warn=True)
+ # set (write) loc to end of tree
+ obj_buf.loc = obj_buf_start + n_rows_read
+ # check attributes
+ if set(obj_buf.attrs.keys()) != set(attrs.keys()):
+ raise RuntimeError(
+ f"attrs mismatch. obj_buf.attrs: "
+ f"{obj_buf.attrs}, h5f[{name}].attrs: {attrs}"
+ )
+ return obj_buf, n_rows_read
+
+ # ArrayOfEncodedEqualSizedArrays and VectorOfEncodedVectors
+ for cond, enc_lgdo in [
+ (
+ datatype == "array_of_encoded_equalsized_arrays",
+ ArrayOfEncodedEqualSizedArrays,
+ ),
+ (elements.startswith("encoded_array"), VectorOfEncodedVectors),
+ ]:
+ if cond:
+ if (
+ not decompress
+ and obj_buf is not None
+ and not isinstance(obj_buf, enc_lgdo)
+ ):
+ raise ValueError(f"obj_buf for '{name}' not a {enc_lgdo}")
+
+ # read out decoded_size, either a Scalar or an Array
+ decoded_size_buf = encoded_data_buf = None
+ if obj_buf is not None and not decompress:
+ decoded_size_buf = obj_buf.decoded_size
+ encoded_data_buf = obj_buf.encoded_data
+
+ decoded_size, _ = self.read(
+ f"{name}/decoded_size",
+ h5f,
+ start_row=start_row,
+ n_rows=n_rows,
+ idx=idx,
+ use_h5idx=use_h5idx,
+ obj_buf=None if decompress else decoded_size_buf,
+ obj_buf_start=0 if decompress else obj_buf_start,
+ )
+
+ # read out encoded_data, a VectorOfVectors
+ encoded_data, n_rows_read = self.read(
+ f"{name}/encoded_data",
+ h5f,
+ start_row=start_row,
+ n_rows=n_rows,
+ idx=idx,
+ use_h5idx=use_h5idx,
+ obj_buf=None if decompress else encoded_data_buf,
+ obj_buf_start=0 if decompress else obj_buf_start,
+ )
+
+ # return the still encoded data in the buffer object, if there
+ if obj_buf is not None and not decompress:
+ return obj_buf, n_rows_read
+
+ # otherwise re-create the encoded LGDO
+ rawdata = enc_lgdo(
+ encoded_data=encoded_data,
+ decoded_size=decoded_size,
+ attrs=h5f[name].attrs,
+ )
+
+ # already return if no decompression is requested
+ if not decompress:
+ return rawdata, n_rows_read
+
+ # if no buffer, decode and return
+ elif obj_buf is None and decompress:
+ return compress.decode(rawdata), n_rows_read
+
+ # eventually expand provided obj_buf, if too short
+ buf_size = obj_buf_start + n_rows_read
+ if len(obj_buf) < buf_size:
+ obj_buf.resize(buf_size)
+
+ # use the (decoded object type) buffer otherwise
+ if enc_lgdo == ArrayOfEncodedEqualSizedArrays:
+ if not isinstance(obj_buf, ArrayOfEqualSizedArrays):
+ raise ValueError(
+ f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
+ )
+
+ compress.decode(rawdata, obj_buf[obj_buf_start:buf_size])
+
+ elif enc_lgdo == VectorOfEncodedVectors:
+ if not isinstance(obj_buf, VectorOfVectors):
+ raise ValueError(
+ f"obj_buf for decoded '{name}' not a VectorOfVectors"
+ )
+
+ # FIXME: not a good idea. an in place decoding version
+ # of decode would be needed to avoid extra memory
+ # allocations
+ for i, wf in enumerate(compress.decode(rawdata)):
+ obj_buf[obj_buf_start + i] = wf
+
+ return obj_buf, n_rows_read
+
+ # VectorOfVectors
+ # read out vector of vectors of different size
+ if elements.startswith("array"):
+ if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors):
+ raise ValueError(f"obj_buf for '{name}' not a LGDO VectorOfVectors")
+
+ # read out cumulative_length
+ cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
+ cumulative_length, n_rows_read = self.read(
+ f"{name}/cumulative_length",
+ h5f,
+ start_row=start_row,
+ n_rows=n_rows,
+ idx=idx,
+ use_h5idx=use_h5idx,
+ obj_buf=cumulen_buf,
+ obj_buf_start=obj_buf_start,
+ )
+ # get a view of just what was read out for cleaner code below
+ this_cumulen_nda = cumulative_length.nda[
+ obj_buf_start : obj_buf_start + n_rows_read
+ ]
+
+ if idx is not None and n_rows_read > 0:
+                # get the starting indices for each array in flattened data:
+ # the starting index for array[i] is cumulative_length[i-1]
+ idx2 = (np.asarray(idx[0]).copy() - 1,)
+ # re-read cumulative_length with these indices
+ # note this will allocate memory for fd_starts!
+ fd_start = None
+ if idx2[0][0] == -1:
+ idx2 = (idx2[0][1:],)
+ fd_start = 0 # this variable avoids an ndarray append
+ fd_starts, fds_n_rows_read = self.read(
+ f"{name}/cumulative_length",
+ h5f,
+ start_row=start_row,
+ n_rows=n_rows,
+ idx=idx2,
+ use_h5idx=use_h5idx,
+ )
+ fd_starts = fd_starts.nda # we just need the nda
+ if fd_start is None:
+ fd_start = fd_starts[0]
+
+ # compute the length that flattened_data will have after the
+ # fancy-indexed read
+ fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts)
+ if fd_start == 0:
+ fd_n_rows += this_cumulen_nda[0]
+
+ # now make fd_idx
+ fd_idx = np.empty(fd_n_rows, dtype="uint32")
+ fd_idx = _make_fd_idx(fd_starts, this_cumulen_nda, fd_idx)
+
+ # Now clean up this_cumulen_nda, to be ready
+ # to match the in-memory version of flattened_data. Note: these
+ # operations on the view change the original array because they are
+ # numpy arrays, not lists.
+ this_cumulen_nda[-len(fd_starts) :] -= fd_starts
+ np.cumsum(this_cumulen_nda, out=this_cumulen_nda)
+
+ else:
+ fd_idx = None
+
+ # determine the start_row and n_rows for the flattened_data readout
+ fd_start = 0
+ if start_row > 0 and n_rows_read > 0:
+ # need to read out the cumulen sample -before- the first sample
+ # read above in order to get the starting row of the first
+ # vector to read out in flattened_data
+ fd_start = h5f[f"{name}/cumulative_length"][start_row - 1]
+
+ # check limits for values that will be used subsequently
+ if this_cumulen_nda[-1] < fd_start:
+ log.debug(
+ f"this_cumulen_nda[-1] = {this_cumulen_nda[-1]}, "
+ f"fd_start = {fd_start}, "
+ f"start_row = {start_row}, "
+ f"n_rows_read = {n_rows_read}"
+ )
+ raise RuntimeError(
+ f"cumulative_length non-increasing between entries "
+ f"{start_row} and {start_row+n_rows_read} ??"
+ )
+
+ # determine the number of rows for the flattened_data readout
+ fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0
+
+ # Now done with this_cumulen_nda, so we can clean it up to be ready
+ # to match the in-memory version of flattened_data. Note: these
+ # operations on the view change the original array because they are
+ # numpy arrays, not lists.
+ #
+ # First we need to subtract off the in-file offset for the start of
+ # read for flattened_data
+ this_cumulen_nda -= fd_start
+
+ # If we started with a partially-filled buffer, add the
+ # appropriate offset for the start of the in-memory flattened
+ # data for this read.
+ fd_buf_start = np.uint32(0)
+ if obj_buf_start > 0:
+ fd_buf_start = cumulative_length.nda[obj_buf_start - 1]
+ this_cumulen_nda += fd_buf_start
+
+ # Now prepare the object buffer if necessary
+ fd_buf = None
+ if obj_buf is not None:
+ fd_buf = obj_buf.flattened_data
+ # grow fd_buf if necessary to hold the data
+ fdb_size = fd_buf_start + fd_n_rows
+ if len(fd_buf) < fdb_size:
+ fd_buf.resize(fdb_size)
+
+ # now read
+ flattened_data, dummy_rows_read = self.read(
+ f"{name}/flattened_data",
+ h5f,
+ start_row=fd_start,
+ n_rows=fd_n_rows,
+ idx=fd_idx,
+ use_h5idx=use_h5idx,
+ obj_buf=fd_buf,
+ obj_buf_start=fd_buf_start,
+ )
+ if obj_buf is not None:
+ return obj_buf, n_rows_read
+ return (
+ VectorOfVectors(
+ flattened_data=flattened_data,
+ cumulative_length=cumulative_length,
+ attrs=h5f[name].attrs,
+ ),
+ n_rows_read,
+ )
+
+ # Array
+ # FixedSizeArray
+ # ArrayOfEqualSizedArrays
+ # read out all arrays by slicing
+ if "array" in datatype:
+ if obj_buf is not None:
+ if not isinstance(obj_buf, Array):
+ raise ValueError(f"obj_buf for '{name}' not an LGDO Array")
+ obj_buf = None
+
+ # compute the number of rows to read
+ # we culled idx above for start_row and n_rows, now we have to apply
+ # the constraint of the length of the dataset
+ ds_n_rows = h5f[name].shape[0]
+ if idx is not None:
+ if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
+ log.warning(
+ "idx indexed past the end of the array in the file. Culling..."
+ )
+ n_rows_to_read = bisect_left(idx[0], ds_n_rows)
+ idx = (idx[0][:n_rows_to_read],)
+ if len(idx[0]) == 0:
+ log.warning("idx empty after culling.")
+ n_rows_to_read = len(idx[0])
+ else:
+ n_rows_to_read = ds_n_rows - start_row
+ if n_rows_to_read > n_rows:
+ n_rows_to_read = n_rows
+
+ # if idx is passed, check if we can make it a slice instead (faster)
+ change_idx_to_slice = False
+
+ # prepare the selection for the read. Use idx if available
+ if idx is not None:
+ # check if idx is empty and convert to slice instead
+ if len(idx[0]) == 0:
+ source_sel = np.s_[0:0]
+ change_idx_to_slice = True
+ # check if idx is contiguous and increasing
+ # if so, convert it to a slice instead (faster)
+ elif np.all(np.diff(idx[0]) == 1):
+ source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
+ change_idx_to_slice = True
+ else:
+ source_sel = idx
+ else:
+ source_sel = np.s_[start_row : start_row + n_rows_to_read]
+
+ # Now read the array
+ if obj_buf is not None and n_rows_to_read > 0:
+ buf_size = obj_buf_start + n_rows_to_read
+ if len(obj_buf) < buf_size:
+ obj_buf.resize(buf_size)
+ dest_sel = np.s_[obj_buf_start:buf_size]
+
+ # this is required to make the read of multiple files faster
+ # until a better solution found.
+ if change_idx_to_slice or idx is None or use_h5idx:
+ h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
+ else:
+ # it is faster to read the whole object and then do fancy indexing
+ obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
+
+ nda = obj_buf.nda
+ else:
+ if n_rows == 0:
+ tmp_shape = (0,) + h5f[name].shape[1:]
+ nda = np.empty(tmp_shape, h5f[name].dtype)
+ else:
+ if change_idx_to_slice or idx is None or use_h5idx:
+ nda = h5f[name][source_sel]
+ else:
+ # it is faster to read the whole object and then do fancy indexing
+ nda = h5f[name][...][source_sel]
+
+ # if reading a list of files recursively, this is given to obj_buf on
+ # the first file read. obj_buf needs to be resized and therefore
+ # it needs to hold the data itself (not a view of the data).
+ # a view is returned by the source_sel indexing, which cannot be resized
+ # by ndarray.resize().
+ if hasattr(self, "in_file_loop") and self.in_file_loop:
+ nda = np.copy(nda)
+
+ # special handling for bools
+ # (c and Julia store as uint8 so cast to bool)
+ if elements == "bool":
+ nda = nda.astype(np.bool_)
+
+ # Finally, set attributes and return objects
+ attrs = h5f[name].attrs
+ if obj_buf is None:
+ if datatype == "array":
+ return Array(nda=nda, attrs=attrs), n_rows_to_read
+ if datatype == "fixedsize_array":
+ return FixedSizeArray(nda=nda, attrs=attrs), n_rows_to_read
+ if datatype == "array_of_equalsized_arrays":
+ return (
+ ArrayOfEqualSizedArrays(nda=nda, dims=shape, attrs=attrs),
+ n_rows_to_read,
+ )
+ else:
+ if set(obj_buf.attrs.keys()) != set(attrs.keys()):
+ raise RuntimeError(
+ f"attrs mismatch. "
+ f"obj_buf.attrs: {obj_buf.attrs}, "
+ f"h5f[{name}].attrs: {attrs}"
+ )
+ return obj_buf, n_rows_to_read
+
+        raise RuntimeError(f"don't know how to read datatype {datatype}")
+
+ def write(
+ self,
+ obj: LGDO,
+ name: str,
+ lh5_file: str | h5py.File,
+ group: str | h5py.Group = "/",
+ start_row: int = 0,
+ n_rows: int = None,
+ wo_mode: str = "append",
+ write_start: int = 0,
+ **h5py_kwargs,
+ ) -> None:
+ """Write an LGDO into an LH5 file.
+
+ If the `obj` :class:`.LGDO` has a `compression` attribute, its value is
+ interpreted as the algorithm to be used to compress `obj` before
+ writing to disk. The type of `compression` can be:
+
+ string, kwargs dictionary, hdf5plugin filter
+            interpreted as the name of a built-in or custom HDF5 compression
+            filter
+ (``"gzip"``, ``"lzf"``, :mod:`hdf5plugin` filter object etc.) and
+ passed directly to :meth:`h5py.Group.create_dataset`.
+
+ :class:`.WaveformCodec` object
+ If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds the
+ attribute, compress ``values`` using this algorithm. More
+ documentation about the supported waveform compression algorithms at
+ :mod:`.lgdo.compression`.
+
+ If the `obj` :class:`.LGDO` has a `hdf5_settings` attribute holding a
+ dictionary, it is interpreted as a list of keyword arguments to be
+ forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like
+ the first format of `compression` above). This is the preferred way to
+ specify HDF5 dataset options such as chunking etc. If compression
+ options are specified, they take precedence over those set with the
+ `compression` attribute.
+
+ Note
+ ----
+ The `compression` LGDO attribute takes precedence over the default HDF5
+ compression settings. The `hdf5_settings` attribute takes precedence
+ over `compression`. These attributes are not written to disk.
+
+ Note
+ ----
+ HDF5 compression is skipped for the `encoded_data.flattened_data`
+ dataset of :class:`.VectorOfEncodedVectors` and
+ :class:`.ArrayOfEncodedEqualSizedArrays`.
+
+ Parameters
+ ----------
+ obj
+            LH5 object. If the object is array-like, writes `n_rows` starting from
+ `start_row` in `obj`.
+ name
+ name of the object in the output HDF5 file.
+ lh5_file
+ HDF5 file name or :class:`h5py.File` object.
+ group
+ HDF5 group name or :class:`h5py.Group` object in which `obj` should
+ be written.
+ start_row
+ first row in `obj` to be written.
+ n_rows
+ number of rows in `obj` to be written.
+ wo_mode
+ - ``write_safe`` or ``w``: only proceed with writing if the
+ object does not already exist in the file.
+ - ``append`` or ``a``: append along axis 0 (the first dimension)
+ of array-like objects and array-like subfields of structs.
+ :class:`~.lgdo.scalar.Scalar` objects get overwritten.
+ - ``overwrite`` or ``o``: replace data in the file if present,
+ starting from `write_start`. Note: overwriting with `write_start` =
+ end of array is the same as ``append``.
+ - ``overwrite_file`` or ``of``: delete file if present prior to
+              writing to it. `write_start` should be 0 (it is ignored).
+ - ``append_column`` or ``ac``: append columns from an :class:`~.lgdo.table.Table`
+ `obj` only if there is an existing :class:`~.lgdo.table.Table` in the `lh5_file` with
+ the same `name` and :class:`~.lgdo.table.Table.size`. If the sizes don't match,
+ or if there are matching fields, it errors out.
+ write_start
+ row in the output file (if already existing) to start overwriting
+ from.
+ **h5py_kwargs
+ additional keyword arguments forwarded to
+ :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
+ compression filter to be applied before writing non-scalar
+            datasets. **Note: the `compression` keyword argument is ignored
+            if compression is specified as an `obj` attribute.**
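+
+        Examples
+        --------
+        A minimal sketch; the file, group and column names below are
+        illustrative.
+
+        >>> import numpy as np
+        >>> from lgdo import Array, Table
+        >>> from lgdo.lh5 import LH5Store
+        >>> store = LH5Store()
+        >>> tbl = Table(col_dict={"energy": Array(np.arange(10, dtype="float32"))})
+        >>> store.write(tbl, "tbl", "data.lh5", group="/data", wo_mode="of")
+        >>> store.write(tbl, "tbl", "data.lh5", group="/data", wo_mode="a")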
+ """
+ log.debug(
+ f"writing {repr(obj)}[{start_row}:{n_rows}] as "
+ f"{lh5_file}:{group}/{name}[{write_start}:], "
+ f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}"
+ )
+
+ if wo_mode == "write_safe":
+ wo_mode = "w"
+ if wo_mode == "append":
+ wo_mode = "a"
+ if wo_mode == "overwrite":
+ wo_mode = "o"
+ if wo_mode == "overwrite_file":
+ wo_mode = "of"
+ write_start = 0
+ if wo_mode == "append_column":
+ wo_mode = "ac"
+ if wo_mode not in ["w", "a", "o", "of", "ac"]:
+ raise ValueError(f"unknown wo_mode '{wo_mode}'")
+
+        # "mode" is for the h5py.File and wo_mode is for this function
+        # In hdf5, 'a' is really "modify" -- in addition to appending, you can
+        # change any object in the file. So we use file:append for
+        # write:overwrite.
+ mode = "w" if wo_mode == "of" else "a"
+ lh5_file = self.gimme_file(lh5_file, mode=mode)
+ group = self.gimme_group(group, lh5_file)
+ if wo_mode == "w" and name in group:
+ raise RuntimeError(f"can't overwrite '{name}' in wo_mode 'write_safe'")
+
+ # struct or table or waveform table
+ if isinstance(obj, Struct):
+            # In order to append a column, we need to update the `table{old_fields}` value in `group.attrs["datatype"]` to include the new fields.
+ # One way to do this is to override `obj.attrs["datatype"]` to include old and new fields. Then we can write the fields to the table as normal.
+ if wo_mode == "ac":
+ old_group = self.gimme_group(name, group)
+ datatype, shape, fields = parse_datatype(old_group.attrs["datatype"])
+ if datatype not in ["table", "struct"]:
+ raise RuntimeError(
+ f"Trying to append columns to an object of type {datatype}"
+ )
+
+ # If the mode is `append_column`, make sure we aren't appending a table that has a column of the same name as in the existing table
+ # Also make sure that the field we are adding has the same size
+ if len(list(set(fields).intersection(set(obj.keys())))) != 0:
+ raise ValueError(
+ f"Can't append {list(set(fields).intersection(set(obj.keys())))} column(s) to a table with the same field(s)"
+ )
+ # It doesn't matter what key we access, as all fields in the old table have the same size
+ if old_group[list(old_group.keys())[0]].size != obj.size:
+ raise ValueError(
+ f"Table sizes don't match. Trying to append column of size {obj.size} to a table of size {old_group[list(old_group.keys())[0]].size}."
+ )
+
+ # Now we can append the obj.keys() to the old fields, and then update obj.attrs.
+ fields.extend(list(obj.keys()))
+ obj.attrs.pop("datatype")
+ obj.attrs["datatype"] = "table" + "{" + ",".join(fields) + "}"
+
+ group = self.gimme_group(
+ name,
+ group,
+ grp_attrs=obj.attrs,
+ overwrite=(wo_mode in ["o", "ac"]),
+ )
+ # If the mode is overwrite, then we need to peek into the file's table's existing fields
+ # If we are writing a new table to the group that does not contain an old field, we should delete that old field from the file
+ if wo_mode == "o":
+ # Find the old keys in the group that are not present in the new table's keys, then delete them
+ for key in list(set(group.keys()) - set(obj.keys())):
+ log.debug(f"{key} is not present in new table, deleting field")
+ del group[key]
+
+ for field in obj.keys():
+                # possibly compress waveform table values with LGDO's
+ # custom codecs before writing
+ # if waveformtable.values.attrs["compression"] is NOT a
+ # WaveformCodec, just leave it there
+ obj_fld = None
+ if (
+ isinstance(obj, WaveformTable)
+ and field == "values"
+ and not isinstance(obj.values, VectorOfEncodedVectors)
+ and not isinstance(obj.values, ArrayOfEncodedEqualSizedArrays)
+ and "compression" in obj.values.attrs
+ and isinstance(obj.values.attrs["compression"], WaveformCodec)
+ ):
+ codec = obj.values.attrs["compression"]
+ obj_fld = compress.encode(obj.values, codec=codec)
+ else:
+ obj_fld = obj[field]
+
+ # Convert keys to string for dataset names
+ f = str(field)
+ self.write(
+ obj_fld,
+ f,
+ lh5_file,
+ group=group,
+ start_row=start_row,
+ n_rows=n_rows,
+ wo_mode=wo_mode,
+ write_start=write_start,
+ **h5py_kwargs,
+ )
+ return
+
+ # scalars
+ elif isinstance(obj, Scalar):
+ if name in group:
+ if wo_mode in ["o", "a"]:
+ log.debug(f"overwriting {name} in {group}")
+ del group[name]
+ else:
+ raise RuntimeError(
+ f"tried to overwrite {name} in {group} for wo_mode {wo_mode}"
+ )
+ ds = group.create_dataset(name, shape=(), data=obj.value)
+ ds.attrs.update(obj.attrs)
+ return
+
+ # vector of encoded vectors
+ elif isinstance(obj, (VectorOfEncodedVectors, ArrayOfEncodedEqualSizedArrays)):
+ group = self.gimme_group(
+ name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
+ )
+
+ # ask not to further compress flattened_data, it is already compressed!
+ obj.encoded_data.flattened_data.attrs["compression"] = None
+
+ self.write(
+ obj.encoded_data,
+ "encoded_data",
+ lh5_file,
+ group=group,
+ start_row=start_row,
+ n_rows=n_rows,
+ wo_mode=wo_mode,
+ write_start=write_start,
+ **h5py_kwargs,
+ )
+
+ self.write(
+ obj.decoded_size,
+ "decoded_size",
+ lh5_file,
+ group=group,
+ start_row=start_row,
+ n_rows=n_rows,
+ wo_mode=wo_mode,
+ write_start=write_start,
+ **h5py_kwargs,
+ )
+
+ # vector of vectors
+ elif isinstance(obj, VectorOfVectors):
+ group = self.gimme_group(
+ name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
+ )
+ if (
+ n_rows is None
+ or n_rows > obj.cumulative_length.nda.shape[0] - start_row
+ ):
+ n_rows = obj.cumulative_length.nda.shape[0] - start_row
+
+ # if appending we need to add an appropriate offset to the
+ # cumulative lengths as appropriate for the in-file object
+ offset = 0 # declare here because we have to subtract it off at the end
+ if (wo_mode == "a" or wo_mode == "o") and "cumulative_length" in group:
+ len_cl = len(group["cumulative_length"])
+ if wo_mode == "a":
+ write_start = len_cl
+ if len_cl > 0:
+ offset = group["cumulative_length"][write_start - 1]
+
+ # First write flattened_data array. Only write rows with data.
+ fd_start = 0 if start_row == 0 else obj.cumulative_length.nda[start_row - 1]
+ fd_n_rows = obj.cumulative_length.nda[start_row + n_rows - 1] - fd_start
+ self.write(
+ obj.flattened_data,
+ "flattened_data",
+ lh5_file,
+ group=group,
+ start_row=fd_start,
+ n_rows=fd_n_rows,
+ wo_mode=wo_mode,
+ write_start=offset,
+ **h5py_kwargs,
+ )
+
+ # now offset is used to give appropriate in-file values for
+ # cumulative_length. Need to adjust it for start_row
+ if start_row > 0:
+ offset -= obj.cumulative_length.nda[start_row - 1]
+
+ # Add offset to obj.cumulative_length itself to avoid memory allocation.
+ # Then subtract it off after writing! (otherwise it will be changed
+ # upon return)
+ cl_dtype = obj.cumulative_length.nda.dtype.type
+ obj.cumulative_length.nda += cl_dtype(offset)
+
+ self.write(
+ obj.cumulative_length,
+ "cumulative_length",
+ lh5_file,
+ group=group,
+ start_row=start_row,
+ n_rows=n_rows,
+ wo_mode=wo_mode,
+ write_start=write_start,
+ **h5py_kwargs,
+ )
+ obj.cumulative_length.nda -= cl_dtype(offset)
+
+ return
+
+ # if we get this far, must be one of the Array types
+ elif isinstance(obj, Array):
+ if n_rows is None or n_rows > obj.nda.shape[0] - start_row:
+ n_rows = obj.nda.shape[0] - start_row
+
+ nda = obj.nda[start_row : start_row + n_rows]
+
+ # hack to store bools as uint8 for c / Julia compliance
+ if nda.dtype.name == "bool":
+ nda = nda.astype(np.uint8)
+
+ # need to create dataset from ndarray the first time for speed
+ # creating an empty dataset and appending to that is super slow!
+ if (wo_mode != "a" and write_start == 0) or name not in group:
+ # this is needed in order to have a resizable (in the first
+ # axis) data set, i.e. rows can be appended later
+ # NOTE: this automatically turns chunking on!
+ maxshape = (None,) + nda.shape[1:]
+ h5py_kwargs.setdefault("maxshape", maxshape)
+
+ if wo_mode == "o" and name in group:
+ log.debug(f"overwriting {name} in {group}")
+ del group[name]
+
+ # set default compression options
+ for k, v in DEFAULT_HDF5_SETTINGS.items():
+ h5py_kwargs.setdefault(k, v)
+
+ # compress using the 'compression' LGDO attribute, if available
+ if "compression" in obj.attrs:
+ comp_algo = obj.attrs["compression"]
+ if isinstance(comp_algo, dict):
+ h5py_kwargs |= obj.attrs["compression"]
+ else:
+ h5py_kwargs["compression"] = obj.attrs["compression"]
+
+ # and even the 'hdf5_settings' one, preferred
+ if "hdf5_settings" in obj.attrs:
+ h5py_kwargs |= obj.attrs["hdf5_settings"]
+
+ # create HDF5 dataset
+ ds = group.create_dataset(name, data=nda, **h5py_kwargs)
+
+ # attach HDF5 dataset attributes, but not "compression"!
+ _attrs = obj.getattrs(datatype=True)
+ _attrs.pop("compression", None)
+ _attrs.pop("hdf5_settings", None)
+ ds.attrs.update(_attrs)
+ return
+
+ # Now append or overwrite
+ ds = group[name]
+ if not isinstance(ds, h5py.Dataset):
+ raise RuntimeError(
+ f"existing HDF5 object '{name}' in group '{group}'"
+ " is not a dataset! Cannot overwrite or append"
+ )
+
+ old_len = ds.shape[0]
+ if wo_mode == "a":
+ write_start = old_len
+ add_len = write_start + nda.shape[0] - old_len
+ ds.resize(old_len + add_len, axis=0)
+ ds[write_start:] = nda
+ return
+
+ else:
+ raise RuntimeError(
+ f"do not know how to write '{name}' of type '{type(obj).__name__}'"
+ )
+
+ def read_n_rows(self, name: str, lh5_file: str | h5py.File) -> int | None:
+ """Look up the number of rows in an Array-like object called `name` in
+ `lh5_file`.
+
+ Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`."""
+        # this is basically a stripped down version of read
+ h5f = self.gimme_file(lh5_file, "r")
+ if not h5f or name not in h5f:
+ raise KeyError(f"'{name}' not in {lh5_file}")
+
+ # get the datatype
+ if "datatype" not in h5f[name].attrs:
+ raise RuntimeError(
+ f"'{name}' in file {lh5_file} is missing the datatype attribute"
+ )
+
+ datatype = h5f[name].attrs["datatype"]
+ datatype, shape, elements = parse_datatype(datatype)
+
+ # scalars are dim-0 datasets
+ if datatype == "scalar":
+ return None
+
+ # structs don't have rows
+ if datatype == "struct":
+ return None
+
+ # tables should have elements with all the same length
+ if datatype == "table":
+ # read out each of the fields
+ rows_read = None
+ for field in elements:
+ n_rows_read = self.read_n_rows(name + "/" + field, h5f)
+ if not rows_read:
+ rows_read = n_rows_read
+ elif rows_read != n_rows_read:
+ log.warning(
+ f"'{field}' field in table '{name}' has {rows_read} rows, "
+ f"{n_rows_read} was expected"
+ )
+ return rows_read
+
+ # length of vector of vectors is the length of its cumulative_length
+ if elements.startswith("array"):
+ return self.read_n_rows(f"{name}/cumulative_length", h5f)
+
+ # length of vector of encoded vectors is the length of its decoded_size
+ if (
+ elements.startswith("encoded_array")
+ or datatype == "array_of_encoded_equalsized_arrays"
+ ):
+ return self.read_n_rows(f"{name}/encoded_data", h5f)
+
+ # return array length (without reading the array!)
+ if "array" in datatype:
+ # compute the number of rows to read
+ return h5f[name].shape[0]
+
+ raise RuntimeError(f"don't know how to read datatype '{datatype}'")
+
+
+def ls(lh5_file: str | h5py.Group, lh5_group: str = "") -> list[str]:
+ """Return a list of LH5 groups in the input file and group, similar
+ to ``ls`` or ``h5ls``. Supports wildcards in group names.
+
+ Parameters
+ ----------
+ lh5_file
+ name of file.
+ lh5_group
+        group to search. Add a ``/`` to the end of the group name if you want to
+ list all objects inside that group.
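+
+    Examples
+    --------
+    A sketch assuming an LH5 file laid out as ``/geds/raw`` (the file name and
+    layout are illustrative):
+
+    >>> ls("file.lh5")
+    ['geds']
+    >>> ls("file.lh5", "geds/")
+    ['geds/raw']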
+ """
+
+ log.debug(
+ f"Listing objects in '{lh5_file}'"
+ + ("" if lh5_group == "" else f" (and group {lh5_group})")
+ )
+
+ lh5_st = LH5Store()
+ # To use recursively, make lh5_file a h5group instead of a string
+ if isinstance(lh5_file, str):
+ lh5_file = lh5_st.gimme_file(lh5_file, "r")
+ if lh5_group.startswith("/"):
+ lh5_group = lh5_group[1:]
+
+ if lh5_group == "":
+ lh5_group = "*"
+
+ splitpath = lh5_group.split("/", 1)
+ matchingkeys = fnmatch.filter(lh5_file.keys(), splitpath[0])
+
+ if len(splitpath) == 1:
+ return matchingkeys
+ else:
+ ret = []
+ for key in matchingkeys:
+ ret.extend([f"{key}/{path}" for path in ls(lh5_file[key], splitpath[1])])
+ return ret
+
+
+def show(
+ lh5_file: str | h5py.Group,
+ lh5_group: str = "/",
+ attrs: bool = False,
+ indent: str = "",
+ header: bool = True,
+) -> None:
+ """Print a tree of LH5 file contents with LGDO datatype.
+
+ Parameters
+ ----------
+ lh5_file
+ the LH5 file.
+ lh5_group
+ print only contents of this HDF5 group.
+ attrs
+ print the HDF5 attributes too.
+ indent
+ indent the diagram with this string.
+ header
+ print `lh5_group` at the top of the diagram.
+
+ Examples
+ --------
+ >>> from lgdo import show
+ >>> show("file.lh5", "/geds/raw")
+ /geds/raw
+ ├── channel · array<1>{real}
+ ├── energy · array<1>{real}
+ ├── timestamp · array<1>{real}
+ ├── waveform · table{t0,dt,values}
+ │ ├── dt · array<1>{real}
+ │ ├── t0 · array<1>{real}
+ │ └── values · array_of_equalsized_arrays<1,1>{real}
+ └── wf_std · array<1>{real}
+ """
+ # open file
+ if isinstance(lh5_file, str):
+ lh5_file = h5py.File(expand_path(lh5_file), "r")
+
+ # go to group
+ if lh5_group != "/":
+ lh5_file = lh5_file[lh5_group]
+
+ if header:
+ print(f"\033[1m{lh5_group}\033[0m") # noqa: T201
+
+ # get an iterator over the keys in the group
+ it = iter(lh5_file)
+ key = None
+
+ # make sure there is actually something in this file/group
+ try:
+ key = next(it) # get first key
+ except StopIteration:
+ print(f"{indent}└── empty") # noqa: T201
+ return
+
+ # loop over keys
+ while True:
+ val = lh5_file[key]
+ # we want to print the LGDO datatype
+ dtype = val.attrs.get("datatype", default="no datatype")
+ if dtype == "no datatype" and isinstance(val, h5py.Group):
+ dtype = "HDF5 group"
+
+ _attrs = ""
+ if attrs:
+ attrs_d = dict(val.attrs)
+ attrs_d.pop("datatype", "")
+ _attrs = "── " + str(attrs_d) if attrs_d else ""
+
+ # is this the last key?
+ killme = False
+ try:
+ k_new = next(it) # get next key
+ except StopIteration:
+ char = "└──"
+ killme = True # we'll have to kill this loop later
+ else:
+ char = "├──"
+
+ print(f"{indent}{char} \033[1m{key}\033[0m · {dtype} {_attrs}") # noqa: T201
+
+ # if it's a group, call this function recursively
+ if isinstance(val, h5py.Group):
+ show(
+ val,
+ indent=indent + (" " if killme else "│ "),
+ header=False,
+ attrs=attrs,
+ )
+
+ # break or move to next key
+ if killme:
+ break
+ else:
+ key = k_new
+
+
+def load_nda(
+ f_list: str | list[str],
+ par_list: list[str],
+ lh5_group: str = "",
+ idx_list: list[np.ndarray | list | tuple] = None,
+) -> dict[str, np.ndarray]:
+ r"""Build a dictionary of :class:`numpy.ndarray`\ s from LH5 data.
+
+ Given a list of files, a list of LH5 table parameters, and an optional
+ group path, return a NumPy array with all values for each parameter.
+
+ Parameters
+ ----------
+ f_list
+ A list of files. Can contain wildcards.
+ par_list
+ A list of parameters to read from each file.
+ lh5_group
+ group path within which to find the specified parameters.
+ idx_list
+ for fancy-indexed reads. Must be one index array for each file in
+ `f_list`.
+
+ Returns
+ -------
+ par_data
+ A dictionary of the parameter data keyed by the elements of `par_list`.
+ Each entry contains the data for the specified parameter concatenated
+ over all files in `f_list`.
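+
+    Examples
+    --------
+    A sketch; the file pattern, parameter and group names are illustrative:
+
+    >>> par_data = load_nda("raw_run*.lh5", ["baseline", "energy"], "geds/raw")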
+ """
+ if isinstance(f_list, str):
+ f_list = [f_list]
+ if idx_list is not None:
+ idx_list = [idx_list]
+ if idx_list is not None and len(f_list) != len(idx_list):
+ raise ValueError(
+ f"f_list length ({len(f_list)}) != idx_list length ({len(idx_list)})!"
+ )
+
+ # Expand wildcards
+ f_list = [f for f_wc in f_list for f in sorted(glob.glob(os.path.expandvars(f_wc)))]
+
+ sto = LH5Store()
+ par_data = {par: [] for par in par_list}
+ for ii, f in enumerate(f_list):
+ f = sto.gimme_file(f, "r")
+ for par in par_list:
+ if f"{lh5_group}/{par}" not in f:
+ raise RuntimeError(f"'{lh5_group}/{par}' not in file {f_list[ii]}")
+
+ if idx_list is None:
+ data, _ = sto.read(f"{lh5_group}/{par}", f)
+ else:
+ data, _ = sto.read(f"{lh5_group}/{par}", f, idx=idx_list[ii])
+ if not data:
+ continue
+ par_data[par].append(data.nda)
+ par_data = {par: np.concatenate(par_data[par]) for par in par_list}
+ return par_data
+
+
+def load_dfs(
+ f_list: str | list[str],
+ par_list: list[str],
+ lh5_group: str = "",
+ idx_list: list[np.ndarray | list | tuple] = None,
+) -> pd.DataFrame:
+ """Build a :class:`pandas.DataFrame` from LH5 data.
+
+ Given a list of files (can use wildcards), a list of LH5 columns, and
+ optionally the group path, return a :class:`pandas.DataFrame` with all
+ values for each parameter.
+
+ See Also
+ --------
+ :func:`load_nda`
+
+ Returns
+ -------
+ dataframe
+ contains columns for each parameter in `par_list`, and rows containing
+ all data for the associated parameters concatenated over all files in
+ `f_list`.
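+
+    Examples
+    --------
+    A sketch; the file pattern, parameter and group names are illustrative:
+
+    >>> df = load_dfs("raw_run*.lh5", ["baseline", "energy"], "geds/raw")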
+ """
+ return pd.DataFrame(
+ load_nda(f_list, par_list, lh5_group=lh5_group, idx_list=idx_list)
+ )
+
+
+@nb.njit(parallel=False, fastmath=True)
+def _make_fd_idx(starts, stops, idx):
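+    # expand per-vector [start, stop) ranges into a flat array of indices
+    # into flattened_data, for fancy-indexed VectorOfVectors reads.
+    # `idx` must be pre-allocated with the total number of selected elements;
+    # the result is returned as a one-element tuple usable as a selection.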
+ k = 0
+ if len(starts) < len(stops):
+ for i in range(stops[0]):
+ idx[k] = i
+ k += 1
+ stops = stops[1:]
+ for j in range(len(starts)):
+ for i in range(starts[j], stops[j]):
+ idx[k] = i
+ k += 1
+ return (idx,)
diff --git a/src/lgdo/lh5/utils.py b/src/lgdo/lh5/utils.py
new file mode 100644
index 00000000..bc1fd425
--- /dev/null
+++ b/src/lgdo/lh5/utils.py
@@ -0,0 +1,118 @@
+"""Implements utilities for LEGEND Data Objects."""
+from __future__ import annotations
+
+import glob
+import logging
+import os
+import string
+
+log = logging.getLogger(__name__)
+
+
+def parse_datatype(datatype: str) -> tuple[str, tuple[int, ...], str | list[str]]:
+ """Parse datatype string and return type, dimensions and elements.
+
+ Parameters
+ ----------
+ datatype
+ a LGDO-formatted datatype string.
+
+ Returns
+ -------
+    element_type
+        the datatype name.
+    dims
+        if not ``None``, a tuple of dimensions for the LGDO. Note this is
+        not the same as the NumPy shape of the underlying data object. See
+        the LGDO specification for more information. Also see
+        :class:`~.types.ArrayOfEqualSizedArrays` and
+        :meth:`.lh5_store.LH5Store.read` for example code.
+    elements
+        for numeric objects, the element type; for struct-like objects, the
+        list of fields in the struct.
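+
+    Examples
+    --------
+    Datatype strings follow the LGDO specification, e.g.:
+
+    >>> parse_datatype("array<1>{real}")
+    ('array', (1,), 'real')
+    >>> parse_datatype("table{t0,dt,values}")
+    ('table', None, ['t0', 'dt', 'values'])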
+ """
+ if "{" not in datatype:
+ return "scalar", None, datatype
+
+ # for other datatypes, need to parse the datatype string
+ from parse import parse
+
+ datatype, element_description = parse("{}{{{}}}", datatype)
+ if datatype.endswith(">"):
+ datatype, dims = parse("{}<{}>", datatype)
+ dims = [int(i) for i in dims.split(",")]
+ return datatype, tuple(dims), element_description
+ else:
+ return datatype, None, element_description.split(",")
+
+
+def expand_vars(expr: str, substitute: dict[str, str] = None) -> str:
+ """Expand (environment) variables.
+
+ Note
+ ----
+ Malformed variable names and references to non-existing variables are left
+ unchanged.
+
+ Parameters
+ ----------
+ expr
+ string expression, which may include (environment) variables prefixed by
+ ``$``.
+ substitute
+ use this dictionary to substitute variables. Takes precedence over
+ environment variables.
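+
+    Examples
+    --------
+    A minimal example; the variable name and value are illustrative:
+
+    >>> expand_vars("$DATADIR/file.lh5", substitute={"DATADIR": "/tmp"})
+    '/tmp/file.lh5'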
+ """
+ if substitute is None:
+ substitute = {}
+
+ # use provided mapping
+ # then expand env variables
+ return os.path.expandvars(string.Template(expr).safe_substitute(substitute))
+
+
+def expand_path(
+ path: str,
+ substitute: dict[str, str] = None,
+ list: bool = False,
+ base_path: str = None,
+) -> str | list:
+ """Expand (environment) variables and wildcards to return absolute paths.
+
+ Parameters
+ ----------
+ path
+ name of path, which may include environment variables and wildcards.
+ list
+ if ``True``, return a list. If ``False``, return a string; if ``False``
+ and a unique file is not found, raise an exception.
+ substitute
+ use this dictionary to substitute variables. Environment variables take
+ precedence.
+ base_path
+ name of base path. Returned paths will be relative to base.
+
+ Returns
+ -------
+ path or list of paths
+ Unique absolute path, or list of all absolute paths
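+
+    Examples
+    --------
+    A sketch; the path pattern and the returned file names are illustrative
+    and depend on the environment and filesystem:
+
+    >>> expand_path("$DATA/lh5/*.lh5", list=True)  # doctest: +SKIP
+    ['/data/lh5/run0.lh5', '/data/lh5/run1.lh5']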
+ """
+ if base_path is not None and base_path != "":
+ base_path = os.path.expanduser(os.path.expandvars(base_path))
+ path = os.path.join(base_path, path)
+
+ # first expand variables
+ _path = expand_vars(path, substitute)
+
+ # then expand wildcards
+ paths = sorted(glob.glob(os.path.expanduser(_path)))
+
+ if base_path is not None and base_path != "":
+ paths = [os.path.relpath(p, base_path) for p in paths]
+
+ if not list:
+ if len(paths) == 0:
+ raise FileNotFoundError(f"could not find path matching {path}")
+ elif len(paths) > 1:
+ raise FileNotFoundError(f"found multiple paths matching {path}")
+ else:
+ return paths[0]
+ else:
+ return paths
diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py
index 7103d05c..ce8b72cd 100644
--- a/src/lgdo/lh5_store.py
+++ b/src/lgdo/lh5_store.py
@@ -1,166 +1,91 @@
-"""
-This module implements routines from reading and writing LEGEND Data Objects in
-HDF5 files.
-"""
from __future__ import annotations
-import fnmatch
-import glob
-import logging
-import os
import sys
-from bisect import bisect_left
-from collections import defaultdict
-from typing import Any, Iterator, Union
+from typing import Iterator, Union
+from warnings import warn
import h5py
-import numba as nb
import numpy as np
import pandas as pd
-from . import compression as compress
-from .compression import WaveformCodec
-from .lgdo_utils import expand_path, parse_datatype
-from .types import (
- Array,
- ArrayOfEncodedEqualSizedArrays,
- ArrayOfEqualSizedArrays,
- FixedSizeArray,
- Scalar,
- Struct,
- Table,
- VectorOfEncodedVectors,
- VectorOfVectors,
- WaveformTable,
-)
-
+from . import lh5
+from .types import Array # noqa: F401
+from .types import ArrayOfEncodedEqualSizedArrays # noqa: F401
+from .types import ArrayOfEqualSizedArrays # noqa: F401
+from .types import FixedSizeArray # noqa: F401
+from .types import Scalar # noqa: F401
+from .types import Struct # noqa: F401
+from .types import Table # noqa: F401
+from .types import VectorOfEncodedVectors # noqa: F401
+from .types import VectorOfVectors # noqa: F401
+from .types import WaveformTable # noqa: F401
+
+DEFAULT_HDF5_COMPRESSION = None
LGDO = Union[Array, Scalar, Struct, VectorOfVectors]
-
-log = logging.getLogger(__name__)
-
DEFAULT_HDF5_SETTINGS: dict[str, ...] = {"shuffle": True, "compression": "gzip"}
-class LH5Store:
- """
- Class to represent a store of LEGEND HDF5 files. The two main methods
- implemented by the class are :meth:`read_object` and :meth:`write_object`.
-
- Examples
- --------
- >>> from lgdo import LH5Store
- >>> store = LH5Store()
- >>> obj, _ = store.read_object("/geds/waveform", "file.lh5")
- >>> type(obj)
- lgdo.waveform_table.WaveformTable
- """
-
- def __init__(self, base_path: str = "", keep_open: bool = False) -> None:
- """
- Parameters
- ----------
- base_path
- directory path to prepend to LH5 files.
- keep_open
- whether to keep files open by storing the :mod:`h5py` objects as
- class attributes.
- """
- self.base_path = "" if base_path == "" else expand_path(base_path)
- self.keep_open = keep_open
- self.files = {}
-
- def gimme_file(self, lh5_file: str | h5py.File, mode: str = "r") -> h5py.File:
- """Returns a :mod:`h5py` file object from the store or creates a new one.
-
- Parameters
- ----------
- lh5_file
- LH5 file name.
- mode
- mode in which to open file. See :class:`h5py.File` documentation.
- """
- if isinstance(lh5_file, h5py.File):
- return lh5_file
- if mode == "r":
- lh5_file = expand_path(lh5_file, base_path=self.base_path)
- if lh5_file in self.files.keys():
- return self.files[lh5_file]
- if self.base_path != "":
- full_path = os.path.join(self.base_path, lh5_file)
- else:
- full_path = lh5_file
- if mode != "r":
- directory = os.path.dirname(full_path)
- if directory != "" and not os.path.exists(directory):
- log.debug(f"making path {directory}")
- os.makedirs(directory)
- if mode == "r" and not os.path.exists(full_path):
- raise FileNotFoundError(f"file {full_path} not found")
- if mode != "r" and os.path.exists(full_path):
- log.debug(f"opening existing file {full_path} in mode '{mode}'")
- h5f = h5py.File(full_path, mode)
- if self.keep_open:
- self.files[lh5_file] = h5f
- return h5f
-
- def gimme_group(
+class LH5Iterator(lh5.LH5Iterator):
+ def __init__(
self,
- group: str | h5py.Group,
- base_group: h5py.Group,
- grp_attrs: dict[str, Any] = None,
- overwrite: bool = False,
- ) -> h5py.Group:
- """
- Returns an existing :class:`h5py` group from a base group or creates a
- new one. Can also set (or replace) group attributes.
-
- Parameters
- ----------
- group
- name of the HDF5 group.
- base_group
- HDF5 group to be used as a base.
- grp_attrs
- HDF5 group attributes.
- overwrite
- whether overwrite group attributes, ignored if `grp_attrs` is
- ``None``.
- """
- if not isinstance(group, h5py.Group):
- if group in base_group:
- group = base_group[group]
- else:
- group = base_group.create_group(group)
- if grp_attrs is not None:
- group.attrs.update(grp_attrs)
- return group
- if (
- grp_attrs is not None
- and len(set(grp_attrs.items()) ^ set(group.attrs.items())) > 0
- ):
- if not overwrite:
- raise RuntimeError("grp_attrs != group.attrs but overwrite not set")
- else:
- log.debug(f"overwriting {group}.attrs...")
- for key in group.attrs.keys():
- group.attrs.pop(key)
- group.attrs.update(grp_attrs)
- return group
+ lh5_files: str | list[str],
+ groups: str | list[str],
+ base_path: str = "",
+ entry_list: list[int] | list[list[int]] = None,
+ entry_mask: list[bool] | list[list[bool]] = None,
+ field_mask: dict[str, bool] | list[str] | tuple[str] = None,
+ buffer_len: int = 3200,
+ friend: Iterator = None,
+ ) -> None:
+ warn(
+            "lgdo.lh5_store has moved to a subfolder lgdo.lh5 containing LH5Iterator. "
+            "Please replace 'from lgdo.lh5_store import LH5Iterator' with 'from lgdo.lh5 import LH5Iterator'. "
+ "lgdo.lh5_store will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ super().__init__(
+ lh5_files,
+ groups,
+ base_path,
+ entry_list,
+ entry_mask,
+ field_mask,
+ buffer_len,
+ friend,
+ )
- def get_buffer(
+ def write_object(
self,
+ obj: LGDO,
name: str,
- lh5_file: str | h5py.File | list[str | h5py.File],
- size: int = None,
- field_mask: dict[str, bool] | list[str] | tuple[str] = None,
- ) -> LGDO:
- """Returns an LH5 object appropriate for use as a pre-allocated buffer
- in a read loop. Sets size to `size` if object has a size.
- """
- obj, n_rows = self.read_object(name, lh5_file, n_rows=0, field_mask=field_mask)
- if hasattr(obj, "resize") and size is not None:
- obj.resize(new_size=size)
- return obj
+ lh5_file: str | h5py.File,
+ group: str | h5py.Group = "/",
+ start_row: int = 0,
+ n_rows: int = None,
+ wo_mode: str = "append",
+ write_start: int = 0,
+ **h5py_kwargs,
+ ) -> None:
+ warn(
+ "lgdo.lh5_store has moved to a subfolder lgdo.lh5 containing LH5Iterator. "
+            "The object you are calling this function from uses the old LH5Iterator class. "
+            "Please replace 'from lgdo.lh5_store import LH5Iterator' with 'from lgdo.lh5 import LH5Iterator'. "
+ "lgdo.lh5_store will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+        self.write(
+            obj,
+            name,
+            lh5_file,
+            group=group,
+            start_row=start_row,
+            n_rows=n_rows,
+            wo_mode=wo_mode,
+            write_start=write_start,
+            **h5py_kwargs,
+        )
def read_object(
self,
@@ -169,1165 +94,85 @@ def read_object(
start_row: int = 0,
n_rows: int = sys.maxsize,
idx: np.ndarray | list | tuple | list[np.ndarray | list | tuple] = None,
- use_h5idx: bool = False,
field_mask: dict[str, bool] | list[str] | tuple[str] = None,
obj_buf: LGDO = None,
obj_buf_start: int = 0,
decompress: bool = True,
) -> tuple[LGDO, int]:
- """Read LH5 object data from a file.
-
- Use the ``idx`` parameter to read out particular rows of the data. The ``use_h5idx`` flag
- controls whether *only* those rows are read from disk or if the rows are indexed after reading
- the entire object. Reading individual rows can be orders of magnitude slower than reading
- the whole object and then indexing the desired rows. The default behavior (``use_h5idx=False``)
- is to use slightly more memory for a much faster read. See
- `legend-pydataobj #29 `_
- for additional information.
-
- Parameters
- ----------
- name
- Name of the LH5 object to be read (including its group path).
- lh5_file
- The file(s) containing the object to be read out. If a list of
- files, array-like object data will be concatenated into the output
- object.
- start_row
- Starting entry for the object read (for array-like objects). For a
- list of files, only applies to the first file.
- n_rows
- The maximum number of rows to read (for array-like objects). The
- actual number of rows read will be returned as one of the return
- values (see below).
- idx
- For NumPy-style "fancying indexing" for the read to select only some
- rows, e.g. after applying some cuts to particular columns.
- Only selection along the first axis is supported, so tuple arguments
- must be one-tuples. If `n_rows` is not false, `idx` will be truncated to
- `n_rows` before reading. To use with a list of files, can pass in a list of
- `idx`'s (one for each file) or use a long contiguous list (e.g. built from a previous
- identical read). If used in conjunction with `start_row` and `n_rows`,
- will be sliced to obey those constraints, where `n_rows` is
- interpreted as the (max) number of *selected* values (in `idx`) to be
- read out. Note that the ``use_h5idx`` parameter controls some behaviour of the
- read and that the default behavior (``use_h5idx=False``) prioritizes speed over
- a small memory penalty.
- use_h5idx
- ``True`` will directly pass the ``idx`` parameter to the underlying
- ``h5py`` call such that only the selected rows are read directly into memory,
- which conserves memory at the cost of speed. There can be a significant penalty
- to speed for larger files (1 - 2 orders of magnitude longer time).
- ``False`` (default) will read the entire object into memory before
- performing the indexing. The default is much faster but requires additional memory,
- though a relatively small amount in the typical use case. It is recommended to
- leave this parameter as its default.
- field_mask
- For tables and structs, determines which fields get written out.
- Only applies to immediate fields of the requested objects. If a dict
- is used, a default dict will be made with the default set to the
- opposite of the first element in the dict. This way if one specifies
- a few fields at ``False``, all but those fields will be read out,
- while if one specifies just a few fields as ``True``, only those
- fields will be read out. If a list is provided, the listed fields
- will be set to ``True``, while the rest will default to ``False``.
- obj_buf
- Read directly into memory provided in `obj_buf`. Note: the buffer
- will be expanded to accommodate the data requested. To maintain the
- buffer length, send in ``n_rows = len(obj_buf)``.
- obj_buf_start
- Start location in ``obj_buf`` for read. For concatenating data to
- array-like objects.
- decompress
- Decompress data encoded with LGDO's compression routines right
- after reading. The option has no effect on data encoded with HDF5
- built-in filters, which is always decompressed upstream by HDF5.
-
-
- Returns
- -------
- (object, n_rows_read)
- `object` is the read-out object `n_rows_read` is the number of rows
- successfully read out. Essential for arrays when the amount of data
- is smaller than the object buffer. For scalars and structs
- `n_rows_read` will be``1``. For tables it is redundant with
- ``table.loc``.
- """
- # Handle list-of-files recursively
- if not isinstance(lh5_file, (str, h5py.File)):
- lh5_file = list(lh5_file)
- n_rows_read = 0
-
- # to know whether we are reading in a list of files.
- # this is part of the fix for reading data by idx
- # (see https://github.com/legend-exp/legend-pydataobj/issues/29)
- # so that we only make a copy of the data if absolutely necessary
- # or if we can read the data from file without having to make a copy
- self.in_file_loop = True
-
- for i, h5f in enumerate(lh5_file):
- if isinstance(idx, list) and len(idx) > 0 and not np.isscalar(idx[0]):
- # a list of lists: must be one per file
- idx_i = idx[i]
- elif idx is not None:
- # make idx a proper tuple if it's not one already
- if not (isinstance(idx, tuple) and len(idx) == 1):
- idx = (idx,)
- # idx is a long continuous array
- n_rows_i = self.read_n_rows(name, h5f)
- # find the length of the subset of idx that contains indices
- # that are less than n_rows_i
- n_rows_to_read_i = bisect_left(idx[0], n_rows_i)
- # now split idx into idx_i and the remainder
- idx_i = (idx[0][:n_rows_to_read_i],)
- idx = (idx[0][n_rows_to_read_i:] - n_rows_i,)
- else:
- idx_i = None
- n_rows_i = n_rows - n_rows_read
-
- # maybe someone passed in a list of len==1?
- if i == (len(lh5_file) - 1):
- self.in_file_loop = False
-
- obj_buf, n_rows_read_i = self.read_object(
- name,
- lh5_file[i],
- start_row=start_row,
- n_rows=n_rows_i,
- idx=idx_i,
- use_h5idx=use_h5idx,
- field_mask=field_mask,
- obj_buf=obj_buf,
- obj_buf_start=obj_buf_start,
- decompress=decompress,
- )
-
- n_rows_read += n_rows_read_i
- if n_rows_read >= n_rows or obj_buf is None:
- return obj_buf, n_rows_read
- start_row = 0
- obj_buf_start += n_rows_read_i
-
- self.in_file_loop = False
-
- return obj_buf, n_rows_read
-
- # get the file from the store
- h5f = self.gimme_file(lh5_file, "r")
- if not h5f or name not in h5f:
- raise KeyError(f"'{name}' not in {h5f.filename}")
-
- log.debug(
- f"reading {h5f.filename}:{name}[{start_row}:{n_rows}], decompress = {decompress}, "
- + (f" with field mask {field_mask}" if field_mask else "")
+ warn(
+ "lgdo.lh5_store has moved to a subfolder lgdo.lh5 containing LH5Iterator. "
+            "The object you are calling this function from uses the old LH5Iterator class. "
+            "Please replace 'from lgdo.lh5_store import LH5Iterator' with 'from lgdo.lh5 import LH5Iterator'. "
+ "lgdo.lh5_store will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
)
-
- # make idx a proper tuple if it's not one already
- if not (isinstance(idx, tuple) and len(idx) == 1):
- if idx is not None:
- idx = (idx,)
-
- # get the object's datatype
- if "datatype" not in h5f[name].attrs:
- raise RuntimeError(
- f"'{name}' in file {lh5_file} is missing the datatype attribute"
- )
-
- datatype = h5f[name].attrs["datatype"]
- datatype, shape, elements = parse_datatype(datatype)
-
- # check field_mask and make it a default dict
- if datatype == "struct" or datatype == "table":
- if field_mask is None:
- field_mask = defaultdict(lambda: True)
- elif isinstance(field_mask, dict):
- default = True
- if len(field_mask) > 0:
- default = not field_mask[list(field_mask.keys())[0]]
- field_mask = defaultdict(lambda: default, field_mask)
- elif isinstance(field_mask, (list, tuple)):
- field_mask = defaultdict(
- lambda: False, {field: True for field in field_mask}
- )
- elif not isinstance(field_mask, defaultdict):
- raise RuntimeError("bad field_mask of type", type(field_mask).__name__)
- elif field_mask is not None:
- raise RuntimeError(f"datatype {datatype} does not accept a field_mask")
-
- # Scalar
- # scalars are dim-0 datasets
- if datatype == "scalar":
- value = h5f[name][()]
- if elements == "bool":
- value = np.bool_(value)
- if obj_buf is not None:
- obj_buf.value = value
- obj_buf.attrs.update(h5f[name].attrs)
- return obj_buf, 1
- else:
- return Scalar(value=value, attrs=h5f[name].attrs), 1
-
- # Struct
- # recursively build a struct, return as a dictionary
- if datatype == "struct":
- # ignore obj_buf.
- # TODO: could append new fields or overwrite/concat to existing
- # fields. If implemented, get_buffer() above should probably also
- # (optionally?) prep buffers for each field
- if obj_buf is not None:
- raise NotImplementedError("obj_buf not implemented for LGOD Structs")
-
- # loop over fields and read
- obj_dict = {}
- for field in elements:
- if not field_mask[field]:
- continue
- # TODO: it's strange to pass start_row, n_rows, idx to struct
- # fields. If they all had shared indexing, they should be in a
- # table... Maybe should emit a warning? Or allow them to be
- # dicts keyed by field name?
- if "int_keys" in h5f[name].attrs:
- if dict(h5f[name].attrs)["int_keys"]:
- f = int(field)
- else:
- f = str(field)
- obj_dict[f], _ = self.read_object(
- name + "/" + field,
- h5f,
- start_row=start_row,
- n_rows=n_rows,
- idx=idx,
- use_h5idx=use_h5idx,
- decompress=decompress,
- )
- # modify datatype in attrs if a field_mask was used
- attrs = dict(h5f[name].attrs)
- if field_mask is not None:
- selected_fields = []
- for field in elements:
- if field_mask[field]:
- selected_fields.append(field)
- attrs["datatype"] = "struct" + "{" + ",".join(selected_fields) + "}"
- return Struct(obj_dict=obj_dict, attrs=attrs), 1
-
- # Below here is all array-like types. So trim idx if needed
- if idx is not None:
- # chop off indices < start_row
- i_first_valid = bisect_left(idx[0], start_row)
- idxa = idx[0][i_first_valid:]
- # don't readout more than n_rows indices
- idx = (idxa[:n_rows],) # works even if n_rows > len(idxa)
-
- # Table or WaveformTable
- if datatype == "table":
- col_dict = {}
-
- # read out each of the fields
- rows_read = []
- for field in elements:
- if not field_mask[field]:
- continue
-
- fld_buf = None
- if obj_buf is not None:
- if not isinstance(obj_buf, Table) or field not in obj_buf:
- raise ValueError(
- f"obj_buf for LGDO Table '{name}' not formatted correctly"
- )
-
- else:
- fld_buf = obj_buf[field]
-
- col_dict[field], n_rows_read = self.read_object(
- name + "/" + field,
- h5f,
- start_row=start_row,
- n_rows=n_rows,
- idx=idx,
- use_h5idx=use_h5idx,
- obj_buf=fld_buf,
- obj_buf_start=obj_buf_start,
- decompress=decompress,
- )
- if obj_buf is not None and obj_buf_start + n_rows_read > len(obj_buf):
- obj_buf.resize(obj_buf_start + n_rows_read)
-
- rows_read.append(n_rows_read)
-
- # warn if all columns don't read in the same number of rows
- if len(rows_read) > 0:
- n_rows_read = rows_read[0]
- else:
- n_rows_read = 0
- log.warning(f"Table '{name}' has no subgroups accepted by field mask")
-
- for n in rows_read[1:]:
- if n != n_rows_read:
- log.warning(
- f"Table '{name}' got strange n_rows_read = {n}, {n_rows_read} was expected ({rows_read})"
- )
-
- # modify datatype in attrs if a field_mask was used
- attrs = dict(h5f[name].attrs)
- if field_mask is not None:
- selected_fields = []
- for field in elements:
- if field_mask[field]:
- selected_fields.append(field)
- attrs["datatype"] = "table" + "{" + ",".join(selected_fields) + "}"
-
- # fields have been read out, now return a table
- if obj_buf is None:
- # if col_dict contains just 3 objects called t0, dt, and values,
- # return a WaveformTable
- if (
- len(col_dict) == 3
- and "t0" in col_dict
- and "dt" in col_dict
- and "values" in col_dict
- ):
- table = WaveformTable(
- t0=col_dict["t0"], dt=col_dict["dt"], values=col_dict["values"]
- )
- else:
- table = Table(col_dict=col_dict, attrs=attrs)
-
- # set (write) loc to end of tree
- table.loc = n_rows_read
- return table, n_rows_read
- else:
- # We have read all fields into the object buffer. Run
- # checks: All columns should be the same size. So update
- # table's size as necessary, warn if any mismatches are found
- obj_buf.resize(do_warn=True)
- # set (write) loc to end of tree
- obj_buf.loc = obj_buf_start + n_rows_read
- # check attributes
- if set(obj_buf.attrs.keys()) != set(attrs.keys()):
- raise RuntimeError(
- f"attrs mismatch. obj_buf.attrs: "
- f"{obj_buf.attrs}, h5f[{name}].attrs: {attrs}"
- )
- return obj_buf, n_rows_read
-
- # ArrayOfEncodedEqualSizedArrays and VectorOfEncodedVectors
- for cond, enc_lgdo in [
- (
- datatype == "array_of_encoded_equalsized_arrays",
- ArrayOfEncodedEqualSizedArrays,
- ),
- (elements.startswith("encoded_array"), VectorOfEncodedVectors),
- ]:
- if cond:
- if (
- not decompress
- and obj_buf is not None
- and not isinstance(obj_buf, enc_lgdo)
- ):
- raise ValueError(f"obj_buf for '{name}' not a {enc_lgdo}")
-
- # read out decoded_size, either a Scalar or an Array
- decoded_size_buf = encoded_data_buf = None
- if obj_buf is not None and not decompress:
- decoded_size_buf = obj_buf.decoded_size
- encoded_data_buf = obj_buf.encoded_data
-
- decoded_size, _ = self.read_object(
- f"{name}/decoded_size",
- h5f,
- start_row=start_row,
- n_rows=n_rows,
- idx=idx,
- use_h5idx=use_h5idx,
- obj_buf=None if decompress else decoded_size_buf,
- obj_buf_start=0 if decompress else obj_buf_start,
- )
-
- # read out encoded_data, a VectorOfVectors
- encoded_data, n_rows_read = self.read_object(
- f"{name}/encoded_data",
- h5f,
- start_row=start_row,
- n_rows=n_rows,
- idx=idx,
- use_h5idx=use_h5idx,
- obj_buf=None if decompress else encoded_data_buf,
- obj_buf_start=0 if decompress else obj_buf_start,
- )
-
- # return the still encoded data in the buffer object, if there
- if obj_buf is not None and not decompress:
- return obj_buf, n_rows_read
-
- # otherwise re-create the encoded LGDO
- rawdata = enc_lgdo(
- encoded_data=encoded_data,
- decoded_size=decoded_size,
- attrs=h5f[name].attrs,
- )
-
- # already return if no decompression is requested
- if not decompress:
- return rawdata, n_rows_read
-
- # if no buffer, decode and return
- elif obj_buf is None and decompress:
- return compress.decode(rawdata), n_rows_read
-
- # eventually expand provided obj_buf, if too short
- buf_size = obj_buf_start + n_rows_read
- if len(obj_buf) < buf_size:
- obj_buf.resize(buf_size)
-
- # use the (decoded object type) buffer otherwise
- if enc_lgdo == ArrayOfEncodedEqualSizedArrays:
- if not isinstance(obj_buf, ArrayOfEqualSizedArrays):
- raise ValueError(
- f"obj_buf for decoded '{name}' not an ArrayOfEqualSizedArrays"
- )
-
- compress.decode(rawdata, obj_buf[obj_buf_start:buf_size])
-
- elif enc_lgdo == VectorOfEncodedVectors:
- if not isinstance(obj_buf, VectorOfVectors):
- raise ValueError(
- f"obj_buf for decoded '{name}' not a VectorOfVectors"
- )
-
- # FIXME: not a good idea. an in place decoding version
- # of decode would be needed to avoid extra memory
- # allocations
- for i, wf in enumerate(compress.decode(rawdata)):
- obj_buf[obj_buf_start + i] = wf
-
- return obj_buf, n_rows_read
-
- # VectorOfVectors
- # read out vector of vectors of different size
- if elements.startswith("array"):
- if obj_buf is not None and not isinstance(obj_buf, VectorOfVectors):
- raise ValueError(f"obj_buf for '{name}' not a LGDO VectorOfVectors")
-
- # read out cumulative_length
- cumulen_buf = None if obj_buf is None else obj_buf.cumulative_length
- cumulative_length, n_rows_read = self.read_object(
- f"{name}/cumulative_length",
- h5f,
- start_row=start_row,
- n_rows=n_rows,
- idx=idx,
- use_h5idx=use_h5idx,
- obj_buf=cumulen_buf,
- obj_buf_start=obj_buf_start,
- )
- # get a view of just what was read out for cleaner code below
- this_cumulen_nda = cumulative_length.nda[
- obj_buf_start : obj_buf_start + n_rows_read
- ]
-
- if idx is not None and n_rows_read > 0:
- # get the starting indices for each array in flattended data:
- # the starting index for array[i] is cumulative_length[i-1]
- idx2 = (np.asarray(idx[0]).copy() - 1,)
- # re-read cumulative_length with these indices
- # note this will allocate memory for fd_starts!
- fd_start = None
- if idx2[0][0] == -1:
- idx2 = (idx2[0][1:],)
- fd_start = 0 # this variable avoids an ndarray append
- fd_starts, fds_n_rows_read = self.read_object(
- f"{name}/cumulative_length",
- h5f,
- start_row=start_row,
- n_rows=n_rows,
- idx=idx2,
- use_h5idx=use_h5idx,
- )
- fd_starts = fd_starts.nda # we just need the nda
- if fd_start is None:
- fd_start = fd_starts[0]
-
- # compute the length that flattened_data will have after the
- # fancy-indexed read
- fd_n_rows = np.sum(this_cumulen_nda[-len(fd_starts) :] - fd_starts)
- if fd_start == 0:
- fd_n_rows += this_cumulen_nda[0]
-
- # now make fd_idx
- fd_idx = np.empty(fd_n_rows, dtype="uint32")
- fd_idx = _make_fd_idx(fd_starts, this_cumulen_nda, fd_idx)
-
- # Now clean up this_cumulen_nda, to be ready
- # to match the in-memory version of flattened_data. Note: these
- # operations on the view change the original array because they are
- # numpy arrays, not lists.
- this_cumulen_nda[-len(fd_starts) :] -= fd_starts
- np.cumsum(this_cumulen_nda, out=this_cumulen_nda)
-
- else:
- fd_idx = None
-
- # determine the start_row and n_rows for the flattened_data readout
- fd_start = 0
- if start_row > 0 and n_rows_read > 0:
- # need to read out the cumulen sample -before- the first sample
- # read above in order to get the starting row of the first
- # vector to read out in flattened_data
- fd_start = h5f[f"{name}/cumulative_length"][start_row - 1]
-
- # check limits for values that will be used subsequently
- if this_cumulen_nda[-1] < fd_start:
- log.debug(
- f"this_cumulen_nda[-1] = {this_cumulen_nda[-1]}, "
- f"fd_start = {fd_start}, "
- f"start_row = {start_row}, "
- f"n_rows_read = {n_rows_read}"
- )
- raise RuntimeError(
- f"cumulative_length non-increasing between entries "
- f"{start_row} and {start_row+n_rows_read} ??"
- )
-
- # determine the number of rows for the flattened_data readout
- fd_n_rows = this_cumulen_nda[-1] if n_rows_read > 0 else 0
-
- # Now done with this_cumulen_nda, so we can clean it up to be ready
- # to match the in-memory version of flattened_data. Note: these
- # operations on the view change the original array because they are
- # numpy arrays, not lists.
- #
- # First we need to subtract off the in-file offset for the start of
- # read for flattened_data
- this_cumulen_nda -= fd_start
-
- # If we started with a partially-filled buffer, add the
- # appropriate offset for the start of the in-memory flattened
- # data for this read.
- fd_buf_start = np.uint32(0)
- if obj_buf_start > 0:
- fd_buf_start = cumulative_length.nda[obj_buf_start - 1]
- this_cumulen_nda += fd_buf_start
-
- # Now prepare the object buffer if necessary
- fd_buf = None
- if obj_buf is not None:
- fd_buf = obj_buf.flattened_data
- # grow fd_buf if necessary to hold the data
- fdb_size = fd_buf_start + fd_n_rows
- if len(fd_buf) < fdb_size:
- fd_buf.resize(fdb_size)
-
- # now read
- flattened_data, dummy_rows_read = self.read_object(
- f"{name}/flattened_data",
- h5f,
- start_row=fd_start,
- n_rows=fd_n_rows,
- idx=fd_idx,
- use_h5idx=use_h5idx,
- obj_buf=fd_buf,
- obj_buf_start=fd_buf_start,
- )
- if obj_buf is not None:
- return obj_buf, n_rows_read
- return (
- VectorOfVectors(
- flattened_data=flattened_data,
- cumulative_length=cumulative_length,
- attrs=h5f[name].attrs,
- ),
- n_rows_read,
- )
-
- # Array
- # FixedSizeArray
- # ArrayOfEqualSizedArrays
- # read out all arrays by slicing
- if "array" in datatype:
- if obj_buf is not None:
- if not isinstance(obj_buf, Array):
- raise ValueError(f"obj_buf for '{name}' not an LGDO Array")
- obj_buf = None
-
- # compute the number of rows to read
- # we culled idx above for start_row and n_rows, now we have to apply
- # the constraint of the length of the dataset
- ds_n_rows = h5f[name].shape[0]
- if idx is not None:
- if len(idx[0]) > 0 and idx[0][-1] >= ds_n_rows:
- log.warning(
- "idx indexed past the end of the array in the file. Culling..."
- )
- n_rows_to_read = bisect_left(idx[0], ds_n_rows)
- idx = (idx[0][:n_rows_to_read],)
- if len(idx[0]) == 0:
- log.warning("idx empty after culling.")
- n_rows_to_read = len(idx[0])
- else:
- n_rows_to_read = ds_n_rows - start_row
- if n_rows_to_read > n_rows:
- n_rows_to_read = n_rows
-
- # if idx is passed, check if we can make it a slice instead (faster)
- change_idx_to_slice = False
-
- # prepare the selection for the read. Use idx if available
- if idx is not None:
- # check if idx is empty and convert to slice instead
- if len(idx[0]) == 0:
- source_sel = np.s_[0:0]
- change_idx_to_slice = True
- # check if idx is contiguous and increasing
- # if so, convert it to a slice instead (faster)
- elif np.all(np.diff(idx[0]) == 1):
- source_sel = np.s_[idx[0][0] : idx[0][-1] + 1]
- change_idx_to_slice = True
- else:
- source_sel = idx
- else:
- source_sel = np.s_[start_row : start_row + n_rows_to_read]
-
- # Now read the array
- if obj_buf is not None and n_rows_to_read > 0:
- buf_size = obj_buf_start + n_rows_to_read
- if len(obj_buf) < buf_size:
- obj_buf.resize(buf_size)
- dest_sel = np.s_[obj_buf_start:buf_size]
-
- # this is required to make the read of multiple files faster
- # until a better solution found.
- if change_idx_to_slice or idx is None or use_h5idx:
- h5f[name].read_direct(obj_buf.nda, source_sel, dest_sel)
- else:
- # it is faster to read the whole object and then do fancy indexing
- obj_buf.nda[dest_sel] = h5f[name][...][source_sel]
-
- nda = obj_buf.nda
- else:
- if n_rows == 0:
- tmp_shape = (0,) + h5f[name].shape[1:]
- nda = np.empty(tmp_shape, h5f[name].dtype)
- else:
- if change_idx_to_slice or idx is None or use_h5idx:
- nda = h5f[name][source_sel]
- else:
- # it is faster to read the whole object and then do fancy indexing
- nda = h5f[name][...][source_sel]
-
- # if reading a list of files recursively, this is given to obj_buf on
- # the first file read. obj_buf needs to be resized and therefore
- # it needs to hold the data itself (not a view of the data).
- # a view is returned by the source_sel indexing, which cannot be resized
- # by ndarray.resize().
- if hasattr(self, "in_file_loop") and self.in_file_loop:
- nda = np.copy(nda)
-
- # special handling for bools
- # (c and Julia store as uint8 so cast to bool)
- if elements == "bool":
- nda = nda.astype(np.bool_)
-
- # Finally, set attributes and return objects
- attrs = h5f[name].attrs
- if obj_buf is None:
- if datatype == "array":
- return Array(nda=nda, attrs=attrs), n_rows_to_read
- if datatype == "fixedsize_array":
- return FixedSizeArray(nda=nda, attrs=attrs), n_rows_to_read
- if datatype == "array_of_equalsized_arrays":
- return (
- ArrayOfEqualSizedArrays(nda=nda, dims=shape, attrs=attrs),
- n_rows_to_read,
- )
- else:
- if set(obj_buf.attrs.keys()) != set(attrs.keys()):
- raise RuntimeError(
- f"attrs mismatch. "
- f"obj_buf.attrs: {obj_buf.attrs}, "
- f"h5f[{name}].attrs: {attrs}"
- )
- return obj_buf, n_rows_to_read
-
- raise RuntimeError("don't know how to read datatype {datatype}")
-
- def write_object(
- self,
- obj: LGDO,
- name: str,
- lh5_file: str | h5py.File,
- group: str | h5py.Group = "/",
- start_row: int = 0,
- n_rows: int = None,
- wo_mode: str = "append",
- write_start: int = 0,
- **h5py_kwargs,
- ) -> None:
- """Write an LGDO into an LH5 file.
-
- If the `obj` :class:`.LGDO` has a `compression` attribute, its value is
- interpreted as the algorithm to be used to compress `obj` before
- writing to disk. The type of `compression` can be:
-
- string, kwargs dictionary, hdf5plugin filter
- interpreted as the name of a built-in or custom `HDF5 compression
- filter `_
- (``"gzip"``, ``"lzf"``, :mod:`hdf5plugin` filter object etc.) and
- passed directly to :meth:`h5py.Group.create_dataset`.
-
- :class:`.WaveformCodec` object
- If `obj` is a :class:`.WaveformTable` and ``obj.values`` holds the
- attribute, compress ``values`` using this algorithm. More
- documentation about the supported waveform compression algorithms at
- :mod:`.lgdo.compression`.
-
- If the `obj` :class:`.LGDO` has a `hdf5_settings` attribute holding a
- dictionary, it is interpreted as a list of keyword arguments to be
- forwarded directly to :meth:`h5py.Group.create_dataset` (exactly like
- the first format of `compression` above). This is the preferred way to
- specify HDF5 dataset options such as chunking etc. If compression
- options are specified, they take precedence over those set with the
- `compression` attribute.
-
- Note
- ----
- The `compression` LGDO attribute takes precedence over the default HDF5
- compression settings. The `hdf5_settings` attribute takes precedence
- over `compression`. These attributes are not written to disk.
-
- Note
- ----
- HDF5 compression is skipped for the `encoded_data.flattened_data`
- dataset of :class:`.VectorOfEncodedVectors` and
- :class:`.ArrayOfEncodedEqualSizedArrays`.
-
- Parameters
- ----------
- obj
- LH5 object. if object is array-like, writes `n_rows` starting from
- `start_row` in `obj`.
- name
- name of the object in the output HDF5 file.
- lh5_file
- HDF5 file name or :class:`h5py.File` object.
- group
- HDF5 group name or :class:`h5py.Group` object in which `obj` should
- be written.
- start_row
- first row in `obj` to be written.
- n_rows
- number of rows in `obj` to be written.
- wo_mode
- - ``write_safe`` or ``w``: only proceed with writing if the
- object does not already exist in the file.
- - ``append`` or ``a``: append along axis 0 (the first dimension)
- of array-like objects and array-like subfields of structs.
- :class:`~.lgdo.scalar.Scalar` objects get overwritten.
- - ``overwrite`` or ``o``: replace data in the file if present,
- starting from `write_start`. Note: overwriting with `write_start` =
- end of array is the same as ``append``.
- - ``overwrite_file`` or ``of``: delete file if present prior to
- writing to it. `write_start` should be 0 (its ignored).
- - ``append_column`` or ``ac``: append columns from an :class:`~.lgdo.table.Table`
- `obj` only if there is an existing :class:`~.lgdo.table.Table` in the `lh5_file` with
- the same `name` and :class:`~.lgdo.table.Table.size`. If the sizes don't match,
- or if there are matching fields, it errors out.
- write_start
- row in the output file (if already existing) to start overwriting
- from.
- **h5py_kwargs
- additional keyword arguments forwarded to
- :meth:`h5py.Group.create_dataset` to specify, for example, an HDF5
- compression filter to be applied before writing non-scalar
- datasets. **Note: `compression` Ignored if compression is specified
- as an `obj` attribute.**
- """
- log.debug(
- f"writing {repr(obj)}[{start_row}:{n_rows}] as "
- f"{lh5_file}:{group}/{name}[{write_start}:], "
- f"mode = {wo_mode}, h5py_kwargs = {h5py_kwargs}"
+        return self.read(
+            name,
+            lh5_file,
+            start_row=start_row,
+            n_rows=n_rows,
+            idx=idx,
+            field_mask=field_mask,
+            obj_buf=obj_buf,
+            obj_buf_start=obj_buf_start,
+            decompress=decompress,
+        )
- if wo_mode == "write_safe":
- wo_mode = "w"
- if wo_mode == "append":
- wo_mode = "a"
- if wo_mode == "overwrite":
- wo_mode = "o"
- if wo_mode == "overwrite_file":
- wo_mode = "of"
- write_start = 0
- if wo_mode == "append_column":
- wo_mode = "ac"
- if wo_mode not in ["w", "a", "o", "of", "ac"]:
- raise ValueError(f"unknown wo_mode '{wo_mode}'")
-
- # "mode" is for the h5df.File and wo_mode is for this function
- # In hdf5, 'a' is really "modify" -- in addition to appending, you can
- # change any object in the file. So we use file:append for
- # write_object:overwrite.
- mode = "w" if wo_mode == "of" else "a"
- lh5_file = self.gimme_file(lh5_file, mode=mode)
- group = self.gimme_group(group, lh5_file)
- if wo_mode == "w" and name in group:
- raise RuntimeError(f"can't overwrite '{name}' in wo_mode 'write_safe'")
-
- # struct or table or waveform table
- if isinstance(obj, Struct):
- # In order to append a column, we need to update the `table{old_fields}` value in `group.attrs["datatype"]` to include the new fields.
- # One way to do this is to override `obj.attrs["datatype"]` to include old and new fields. Then we can write the fields to the table as normal.
- if wo_mode == "ac":
- old_group = self.gimme_group(name, group)
- datatype, shape, fields = parse_datatype(old_group.attrs["datatype"])
- if datatype not in ["table", "struct"]:
- raise RuntimeError(
- f"Trying to append columns to an object of type {datatype}"
- )
-
- # If the mode is `append_column`, make sure we aren't appending a table that has a column of the same name as in the existing table
- # Also make sure that the field we are adding has the same size
- if len(list(set(fields).intersection(set(obj.keys())))) != 0:
- raise ValueError(
- f"Can't append {list(set(fields).intersection(set(obj.keys())))} column(s) to a table with the same field(s)"
- )
- # It doesn't matter what key we access, as all fields in the old table have the same size
- if old_group[list(old_group.keys())[0]].size != obj.size:
- raise ValueError(
- f"Table sizes don't match. Trying to append column of size {obj.size} to a table of size {old_group[list(old_group.keys())[0]].size}."
- )
-
- # Now we can append the obj.keys() to the old fields, and then update obj.attrs.
- fields.extend(list(obj.keys()))
- obj.attrs.pop("datatype")
- obj.attrs["datatype"] = "table" + "{" + ",".join(fields) + "}"
-
- group = self.gimme_group(
- name,
- group,
- grp_attrs=obj.attrs,
- overwrite=(wo_mode in ["o", "ac"]),
- )
- # If the mode is overwrite, then we need to peek into the file's table's existing fields
- # If we are writing a new table to the group that does not contain an old field, we should delete that old field from the file
- if wo_mode == "o":
- # Find the old keys in the group that are not present in the new table's keys, then delete them
- for key in list(set(group.keys()) - set(obj.keys())):
- log.debug(f"{key} is not present in new table, deleting field")
- del group[key]
-
- for field in obj.keys():
- # optionally compress waveform table values with LGDO's
- # custom codecs before writing
- # if waveformtable.values.attrs["compression"] is NOT a
- # WaveformCodec, just leave it there
- obj_fld = None
- if (
- isinstance(obj, WaveformTable)
- and field == "values"
- and not isinstance(obj.values, VectorOfEncodedVectors)
- and not isinstance(obj.values, ArrayOfEncodedEqualSizedArrays)
- and "compression" in obj.values.attrs
- and isinstance(obj.values.attrs["compression"], WaveformCodec)
- ):
- codec = obj.values.attrs["compression"]
- obj_fld = compress.encode(obj.values, codec=codec)
- else:
- obj_fld = obj[field]
-
- # Convert keys to string for dataset names
- f = str(field)
- self.write_object(
- obj_fld,
- f,
- lh5_file,
- group=group,
- start_row=start_row,
- n_rows=n_rows,
- wo_mode=wo_mode,
- write_start=write_start,
- **h5py_kwargs,
- )
- return
-
- # scalars
- elif isinstance(obj, Scalar):
- if name in group:
- if wo_mode in ["o", "a"]:
- log.debug(f"overwriting {name} in {group}")
- del group[name]
- else:
- raise RuntimeError(
- f"tried to overwrite {name} in {group} for wo_mode {wo_mode}"
- )
- ds = group.create_dataset(name, shape=(), data=obj.value)
- ds.attrs.update(obj.attrs)
- return
-
- # vector of encoded vectors
- elif isinstance(obj, (VectorOfEncodedVectors, ArrayOfEncodedEqualSizedArrays)):
- group = self.gimme_group(
- name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
- )
-
- # ask not to further compress flattened_data, it is already compressed!
- obj.encoded_data.flattened_data.attrs["compression"] = None
-
- self.write_object(
- obj.encoded_data,
- "encoded_data",
- lh5_file,
- group=group,
- start_row=start_row,
- n_rows=n_rows,
- wo_mode=wo_mode,
- write_start=write_start,
- **h5py_kwargs,
- )
-
- self.write_object(
- obj.decoded_size,
- "decoded_size",
- lh5_file,
- group=group,
- start_row=start_row,
- n_rows=n_rows,
- wo_mode=wo_mode,
- write_start=write_start,
- **h5py_kwargs,
- )
-
- # vector of vectors
- elif isinstance(obj, VectorOfVectors):
- group = self.gimme_group(
- name, group, grp_attrs=obj.attrs, overwrite=(wo_mode == "o")
- )
- if (
- n_rows is None
- or n_rows > obj.cumulative_length.nda.shape[0] - start_row
- ):
- n_rows = obj.cumulative_length.nda.shape[0] - start_row
- # if appending we need to add an appropriate offset to the
- # cumulative lengths as appropriate for the in-file object
- offset = 0 # declare here because we have to subtract it off at the end
- if (wo_mode == "a" or wo_mode == "o") and "cumulative_length" in group:
- len_cl = len(group["cumulative_length"])
- if wo_mode == "a":
- write_start = len_cl
- if len_cl > 0:
- offset = group["cumulative_length"][write_start - 1]
-
- # First write flattened_data array. Only write rows with data.
- fd_start = 0 if start_row == 0 else obj.cumulative_length.nda[start_row - 1]
- fd_n_rows = obj.cumulative_length.nda[start_row + n_rows - 1] - fd_start
- self.write_object(
- obj.flattened_data,
- "flattened_data",
- lh5_file,
- group=group,
- start_row=fd_start,
- n_rows=fd_n_rows,
- wo_mode=wo_mode,
- write_start=offset,
- **h5py_kwargs,
- )
-
- # now offset is used to give appropriate in-file values for
- # cumulative_length. Need to adjust it for start_row
- if start_row > 0:
- offset -= obj.cumulative_length.nda[start_row - 1]
-
- # Add offset to obj.cumulative_length itself to avoid memory allocation.
- # Then subtract it off after writing! (otherwise it will be changed
- # upon return)
- cl_dtype = obj.cumulative_length.nda.dtype.type
- obj.cumulative_length.nda += cl_dtype(offset)
-
- self.write_object(
- obj.cumulative_length,
- "cumulative_length",
- lh5_file,
- group=group,
- start_row=start_row,
- n_rows=n_rows,
- wo_mode=wo_mode,
- write_start=write_start,
- **h5py_kwargs,
- )
- obj.cumulative_length.nda -= cl_dtype(offset)
-
- return
-
- # if we get this far, must be one of the Array types
- elif isinstance(obj, Array):
- if n_rows is None or n_rows > obj.nda.shape[0] - start_row:
- n_rows = obj.nda.shape[0] - start_row
-
- nda = obj.nda[start_row : start_row + n_rows]
-
- # hack to store bools as uint8 for C / Julia compliance
- if nda.dtype.name == "bool":
- nda = nda.astype(np.uint8)
-
- # need to create dataset from ndarray the first time for speed:
- # creating an empty dataset and appending to it is super slow!
- if (wo_mode != "a" and write_start == 0) or name not in group:
- # this is needed in order to have a resizable (in the first
- # axis) data set, i.e. rows can be appended later
- # NOTE: this automatically turns chunking on!
- maxshape = (None,) + nda.shape[1:]
- h5py_kwargs.setdefault("maxshape", maxshape)
-
- if wo_mode == "o" and name in group:
- log.debug(f"overwriting {name} in {group}")
- del group[name]
-
- # set default compression options
- for k, v in DEFAULT_HDF5_SETTINGS.items():
- h5py_kwargs.setdefault(k, v)
-
- # compress using the 'compression' LGDO attribute, if available
- if "compression" in obj.attrs:
- comp_algo = obj.attrs["compression"]
- if isinstance(comp_algo, dict):
- h5py_kwargs |= obj.attrs["compression"]
- else:
- h5py_kwargs["compression"] = obj.attrs["compression"]
-
- # and even the 'hdf5_settings' one, preferred
- if "hdf5_settings" in obj.attrs:
- h5py_kwargs |= obj.attrs["hdf5_settings"]
-
- # create HDF5 dataset
- ds = group.create_dataset(name, data=nda, **h5py_kwargs)
-
- # attach HDF5 dataset attributes, but not "compression"!
- _attrs = obj.getattrs(datatype=True)
- _attrs.pop("compression", None)
- _attrs.pop("hdf5_settings", None)
- ds.attrs.update(_attrs)
- return
-
- # Now append or overwrite
- ds = group[name]
- if not isinstance(ds, h5py.Dataset):
- raise RuntimeError(
- f"existing HDF5 object '{name}' in group '{group}'"
- " is not a dataset! Cannot overwrite or append"
- )
-
- old_len = ds.shape[0]
- if wo_mode == "a":
- write_start = old_len
- add_len = write_start + nda.shape[0] - old_len
- ds.resize(old_len + add_len, axis=0)
- ds[write_start:] = nda
- return
-
- else:
- raise RuntimeError(
- f"do not know how to write '{name}' of type '{type(obj).__name__}'"
- )
-
- def read_n_rows(self, name: str, lh5_file: str | h5py.File) -> int | None:
- """Look up the number of rows in an Array-like object called `name` in
- `lh5_file`.
-
- Return ``None`` if it is a :class:`.Scalar` or a :class:`.Struct`."""
- # this is basically a stripped down version of read_object
- h5f = self.gimme_file(lh5_file, "r")
- if not h5f or name not in h5f:
- raise KeyError(f"'{name}' not in {lh5_file}")
-
- # get the datatype
- if "datatype" not in h5f[name].attrs:
- raise RuntimeError(
- f"'{name}' in file {lh5_file} is missing the datatype attribute"
- )
-
- datatype = h5f[name].attrs["datatype"]
- datatype, shape, elements = parse_datatype(datatype)
-
- # scalars are dim-0 datasets
- if datatype == "scalar":
- return None
-
- # structs don't have rows
- if datatype == "struct":
- return None
-
- # tables should have elements with all the same length
- if datatype == "table":
- # read out each of the fields
- rows_read = None
- for field in elements:
- n_rows_read = self.read_n_rows(name + "/" + field, h5f)
- if not rows_read:
- rows_read = n_rows_read
- elif rows_read != n_rows_read:
- log.warning(
- f"'{field}' field in table '{name}' has {rows_read} rows, "
- f"{n_rows_read} was expected"
- )
- return rows_read
+class LH5Store(lh5.LH5Store):
+ def __init__(self, base_path: str = "", keep_open: bool = False):
+ warn(
+ "lgdo.lh5_store has moved to a subfolder lgdo.lh5 containing LH5Store. "
+ "Please replace 'from lgdo.lh5_store import LH5Store' with 'from lgdo.lh5 import LH5Store'."
+ "lgdo.lh5_store will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ super().__init__(base_path, keep_open)
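+
+
+# Illustrative usage of the preferred API (file and group names are made up):
+#
+#   from lgdo.lh5 import LH5Store
+#   store = LH5Store()
+#   obj, n_rows = store.read("/geds/raw/waveform", "run0.lh5")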
- # length of vector of vectors is the length of its cumulative_length
- if elements.startswith("array"):
- return self.read_n_rows(f"{name}/cumulative_length", h5f)
- # length of vector of encoded vectors is the length of its decoded_size
- if (
- elements.startswith("encoded_array")
- or datatype == "array_of_encoded_equalsized_arrays"
- ):
- return self.read_n_rows(f"{name}/encoded_data", h5f)
+def load_dfs(
+ f_list: str | list[str],
+ par_list: list[str],
+ lh5_group: str = "",
+ idx_list: list[np.ndarray | list | tuple] = None,
+) -> pd.DataFrame:
+ warn(
+ "lgdo.lh5_store has moved to a subfolder lgdo.lh5. "
+ "Please replace 'from lgdo.lh5_store import load_dfs' with 'from lgdo.lh5 import load_dfs'. "
+ "lgdo.lh5_store will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return lh5.load_dfs(f_list, par_list, lh5_group, idx_list)
- # return array length (without reading the array!)
- if "array" in datatype:
- # compute the number of rows to read
- return h5f[name].shape[0]
- raise RuntimeError(f"don't know how to read datatype '{datatype}'")
+def load_nda(
+ f_list: str | list[str],
+ par_list: list[str],
+ lh5_group: str = "",
+ idx_list: list[np.ndarray | list | tuple] = None,
+) -> dict[str, np.ndarray]:
+ warn(
+ "lgdo.lh5_store has moved to a subfolder lgdo.lh5. "
+ "Please replace 'from lgdo.lh5_store import load_nda' with 'from lgdo.lh5 import load_nda'. "
+ "lgdo.lh5_store will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return lh5.load_nda(f_list, par_list, lh5_group, idx_list)
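+
+
+# Illustrative usage of the preferred functions (file and parameter names are made up):
+#
+#   import lgdo.lh5 as lh5
+#   nda = lh5.load_nda(["run0.lh5"], ["energy"], lh5_group="/geds/raw")
+#   df = lh5.load_dfs(["run0.lh5"], ["energy"], lh5_group="/geds/raw")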
def ls(lh5_file: str | h5py.Group, lh5_group: str = "") -> list[str]:
- """Return a list of LH5 groups in the input file and group, similar
- to ``ls`` or ``h5ls``. Supports wildcards in group names.
-
-
- Parameters
- ----------
- lh5_file
- name of file.
- lh5_group
- group to search. Add a ``/`` to the end of the group name if you want to
- list all objects inside that group.
- """
-
- log.debug(
- f"Listing objects in '{lh5_file}'"
- + ("" if lh5_group == "" else f" (and group {lh5_group})")
+ warn(
+ "lgdo.lh5_store has moved to a subfolder lgdo.lh5. "
+ "Please replace 'from lgdo.lh5_store import ls' with 'from lgdo.lh5 import ls'. "
+ "lgdo.lh5_store will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
)
-
- lh5_st = LH5Store()
- # To use recursively, make lh5_file a h5group instead of a string
- if isinstance(lh5_file, str):
- lh5_file = lh5_st.gimme_file(lh5_file, "r")
- if lh5_group.startswith("/"):
- lh5_group = lh5_group[1:]
-
- if lh5_group == "":
- lh5_group = "*"
-
- splitpath = lh5_group.split("/", 1)
- matchingkeys = fnmatch.filter(lh5_file.keys(), splitpath[0])
-
- if len(splitpath) == 1:
- return matchingkeys
- else:
- ret = []
- for key in matchingkeys:
- ret.extend([f"{key}/{path}" for path in ls(lh5_file[key], splitpath[1])])
- return ret
+ return lh5.ls(lh5_file, lh5_group)
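+
+
+# e.g. (illustrative): lh5.ls("file.lh5") lists the top-level groups, while
+# lh5.ls("file.lh5", "/geds/") lists the objects inside the /geds group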
def show(
@@ -1337,495 +182,11 @@ def show(
indent: str = "",
header: bool = True,
) -> None:
- """Print a tree of LH5 file contents with LGDO datatype.
-
- Parameters
- ----------
- lh5_file
- the LH5 file.
- lh5_group
- print only contents of this HDF5 group.
- attrs
- print the HDF5 attributes too.
- indent
- indent the diagram with this string.
- header
- print `lh5_group` at the top of the diagram.
-
- Examples
- --------
- >>> from lgdo import show
- >>> show("file.lh5", "/geds/raw")
- /geds/raw
- ├── channel · array<1>{real}
- ├── energy · array<1>{real}
- ├── timestamp · array<1>{real}
- ├── waveform · table{t0,dt,values}
- │ ├── dt · array<1>{real}
- │ ├── t0 · array<1>{real}
- │ └── values · array_of_equalsized_arrays<1,1>{real}
- └── wf_std · array<1>{real}
- """
- # open file
- if isinstance(lh5_file, str):
- lh5_file = h5py.File(expand_path(lh5_file), "r")
-
- # go to group
- if lh5_group != "/":
- lh5_file = lh5_file[lh5_group]
-
- if header:
- print(f"\033[1m{lh5_group}\033[0m") # noqa: T201
-
- # get an iterator over the keys in the group
- it = iter(lh5_file)
- key = None
-
- # make sure there is actually something in this file/group
- try:
- key = next(it) # get first key
- except StopIteration:
- print(f"{indent}└── empty") # noqa: T201
- return
-
- # loop over keys
- while True:
- val = lh5_file[key]
- # we want to print the LGDO datatype
- dtype = val.attrs.get("datatype", default="no datatype")
- if dtype == "no datatype" and isinstance(val, h5py.Group):
- dtype = "HDF5 group"
-
- _attrs = ""
- if attrs:
- attrs_d = dict(val.attrs)
- attrs_d.pop("datatype", "")
- _attrs = "── " + str(attrs_d) if attrs_d else ""
-
- # is this the last key?
- killme = False
- try:
- k_new = next(it) # get next key
- except StopIteration:
- char = "└──"
- killme = True # we'll have to kill this loop later
- else:
- char = "├──"
-
- print(f"{indent}{char} \033[1m{key}\033[0m · {dtype} {_attrs}") # noqa: T201
-
- # if it's a group, call this function recursively
- if isinstance(val, h5py.Group):
- show(
- val,
- indent=indent + (" " if killme else "│ "),
- header=False,
- attrs=attrs,
- )
-
- # break or move to next key
- if killme:
- break
- else:
- key = k_new
-
-
-def load_nda(
- f_list: str | list[str],
- par_list: list[str],
- lh5_group: str = "",
- idx_list: list[np.ndarray | list | tuple] = None,
-) -> dict[str, np.ndarray]:
- r"""Build a dictionary of :class:`numpy.ndarray`\ s from LH5 data.
-
- Given a list of files, a list of LH5 table parameters, and an optional
- group path, return a NumPy array with all values for each parameter.
-
- Parameters
- ----------
- f_list
- A list of files. Can contain wildcards.
- par_list
- A list of parameters to read from each file.
- lh5_group
- group path within which to find the specified parameters.
- idx_list
- for fancy-indexed reads. Must be one index array for each file in
- `f_list`.
-
- Returns
- -------
- par_data
- A dictionary of the parameter data keyed by the elements of `par_list`.
- Each entry contains the data for the specified parameter concatenated
- over all files in `f_list`.
- """
- if isinstance(f_list, str):
- f_list = [f_list]
- if idx_list is not None:
- idx_list = [idx_list]
- if idx_list is not None and len(f_list) != len(idx_list):
- raise ValueError(
- f"f_list length ({len(f_list)}) != idx_list length ({len(idx_list)})!"
- )
-
- # Expand wildcards
- f_list = [f for f_wc in f_list for f in sorted(glob.glob(os.path.expandvars(f_wc)))]
-
- sto = LH5Store()
- par_data = {par: [] for par in par_list}
- for ii, f in enumerate(f_list):
- f = sto.gimme_file(f, "r")
- for par in par_list:
- if f"{lh5_group}/{par}" not in f:
- raise RuntimeError(f"'{lh5_group}/{par}' not in file {f_list[ii]}")
-
- if idx_list is None:
- data, _ = sto.read_object(f"{lh5_group}/{par}", f)
- else:
- data, _ = sto.read_object(f"{lh5_group}/{par}", f, idx=idx_list[ii])
- if not data:
- continue
- par_data[par].append(data.nda)
- par_data = {par: np.concatenate(par_data[par]) for par in par_list}
- return par_data
-
-
-def load_dfs(
- f_list: str | list[str],
- par_list: list[str],
- lh5_group: str = "",
- idx_list: list[np.ndarray | list | tuple] = None,
-) -> pd.DataFrame:
- """Build a :class:`pandas.DataFrame` from LH5 data.
-
- Given a list of files (can use wildcards), a list of LH5 columns, and
- optionally the group path, return a :class:`pandas.DataFrame` with all
- values for each parameter.
-
- See Also
- --------
- :func:`load_nda`
-
- Returns
- -------
- dataframe
- contains columns for each parameter in `par_list`, and rows containing
- all data for the associated parameters concatenated over all files in
- `f_list`.
- """
- return pd.DataFrame(
- load_nda(f_list, par_list, lh5_group=lh5_group, idx_list=idx_list)
+ warn(
+ "lgdo.lh5_store has moved to a subfolder lgdo.lh5. "
+ "Please replace 'from lgdo.lh5_store import show' with 'from lgdo.lh5 import show'. "
+ "lgdo.lh5_store will be removed in a future release.",
+ DeprecationWarning,
+ stacklevel=2,
)
-
-
-class LH5Iterator(Iterator):
- """
- A class for iterating through one or more LH5 files, one block of entries
- at a time. This also accepts an entry list/mask to enable event selection,
- and a field mask.
-
- This class can be used either for random access:
-
- >>> lh5_obj, n_rows = lh5_it.read(entry)
-
- to read the block of entries starting at entry. In case of multiple files
- or the use of an event selection, entry refers to a global event index
- across files and does not count events that are excluded by the selection.
-
- This can also be used as an iterator:
-
- >>> for lh5_obj, entry, n_rows in LH5Iterator(...):
- >>> # do the thing!
-
- This is intended for when you are reading a large quantity of data but
- want to limit your memory usage (particularly when reading in waveforms!).
- The ``lh5_obj`` that is read by this class is reused in order to avoid
- reallocation of memory; this means that if you want to hold on to data
- between reads, you will have to copy it somewhere!
- """
-
- def __init__(
- self,
- lh5_files: str | list[str],
- groups: str | list[str],
- base_path: str = "",
- entry_list: list[int] | list[list[int]] = None,
- entry_mask: list[bool] | list[list[bool]] = None,
- field_mask: dict[str, bool] | list[str] | tuple[str] = None,
- buffer_len: int = 3200,
- friend: LH5Iterator = None,
- ) -> None:
- """
- Parameters
- ----------
- lh5_files
- file or files to read from. May include wildcards and environment
- variables.
- groups
- HDF5 group(s) to read. If a list is provided for both lh5_files
- and group, they must be the same size. If a file is wild-carded,
- the same group will be assigned to each file found
- entry_list
- list of entry numbers to read. If a nested list is provided,
- expect one top-level list for each file, containing a list of
- local entries. If a list of ints is provided, use global entries.
- entry_mask
- mask of entries to read. If a list of arrays is provided, expect
- one for each file. Ignore if a selection list is provided.
- field_mask
- mask of which fields to read. See :meth:`LH5Store.read_object` for
- more details.
- buffer_len
- number of entries to read at a time while iterating through files.
- friend
- a "friend" LH5Iterator that will be read in parallel with this.
- The friend should have the same length and entry list. A single
- LH5 table containing columns from both iterators will be returned.
- """
- self.lh5_st = LH5Store(base_path=base_path, keep_open=True)
-
- # List of files, with wildcards and env vars expanded
- if isinstance(lh5_files, str):
- lh5_files = [lh5_files]
- if isinstance(groups, list):
- lh5_files *= len(groups)
- elif not isinstance(lh5_files, list):
- raise ValueError("lh5_files must be a string or list of strings")
-
- if isinstance(groups, str):
- groups = [groups] * len(lh5_files)
- elif not isinstance(groups, list):
- raise ValueError("group must be a string or list of strings")
-
- if not len(groups) == len(lh5_files):
- raise ValueError("lh5_files and groups must have same length")
-
- self.lh5_files = []
- self.groups = []
- for f, g in zip(lh5_files, groups):
- f_exp = expand_path(f, list=True, base_path=base_path)
- self.lh5_files += f_exp
- self.groups += [g] * len(f_exp)
-
- if entry_list is not None and entry_mask is not None:
- raise ValueError(
- "entry_list and entry_mask arguments are mutually exclusive"
- )
-
- # Map to last row in each file
- self.file_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
- # Map to last iterator entry for each file
- self.entry_map = np.full(len(self.lh5_files), np.iinfo("i").max, "i")
- self.buffer_len = buffer_len
-
- if len(self.lh5_files) > 0:
- f = self.lh5_files[0]
- g = self.groups[0]
- self.lh5_buffer = self.lh5_st.get_buffer(
- g,
- f,
- size=self.buffer_len,
- field_mask=field_mask,
- )
- self.file_map[0] = self.lh5_st.read_n_rows(g, f)
- else:
- raise RuntimeError(f"can't open any files from {lh5_files}")
-
- self.n_rows = 0
- self.current_entry = 0
- self.next_entry = 0
-
- self.field_mask = field_mask
-
- # List of entry indices from each file
- self.local_entry_list = None
- self.global_entry_list = None
- if entry_list is not None:
- entry_list = list(entry_list)
- if isinstance(entry_list[0], int):
- self.local_entry_list = [None] * len(self.file_map)
- self.global_entry_list = np.array(entry_list, "i")
- self.global_entry_list.sort()
-
- else:
- self.local_entry_list = [[]] * len(self.file_map)
- for i_file, local_list in enumerate(entry_list):
- self.local_entry_list[i_file] = np.array(local_list, "i")
- self.local_entry_list[i_file].sort()
-
- elif entry_mask is not None:
- # Convert entry mask into an entry list
- if isinstance(entry_mask, pd.Series):
- entry_mask = entry_mask.values
- if isinstance(entry_mask, np.ndarray):
- self.local_entry_list = [None] * len(self.file_map)
- self.global_entry_list = np.nonzero(entry_mask)[0]
- else:
- self.local_entry_list = [[]] * len(self.file_map)
- for i_file, local_mask in enumerate(entry_mask):
- self.local_entry_list[i_file] = np.nonzero(local_mask)[0]
-
- # Attach the friend
- if friend is not None:
- if not isinstance(friend, LH5Iterator):
- raise ValueError("Friend must be an LH5Iterator")
- self.lh5_buffer.join(friend.lh5_buffer)
- self.friend = friend
-
- def _get_file_cumlen(self, i_file: int) -> int:
- """Helper to get cumulative file length of file"""
- if i_file < 0:
- return 0
- fcl = self.file_map[i_file]
- if fcl == np.iinfo("i").max:
- fcl = self._get_file_cumlen(i_file - 1) + self.lh5_st.read_n_rows(
- self.groups[i_file], self.lh5_files[i_file]
- )
- self.file_map[i_file] = fcl
- return fcl
-
- def _get_file_cumentries(self, i_file: int) -> int:
- """Helper to get cumulative iterator entries in file"""
- if i_file < 0:
- return 0
- n = self.entry_map[i_file]
- if n == np.iinfo("i").max:
- elist = self.get_file_entrylist(i_file)
- fcl = self._get_file_cumlen(i_file)
- if elist is None:
- # no entry list provided
- n = fcl
- else:
- file_entries = self.get_file_entrylist(i_file)
- n = len(file_entries)
- # check that file entries fall inside of file
- if n > 0 and file_entries[-1] >= fcl:
- logging.warning(f"Found entries out of range for file {i_file}")
- n = np.searchsorted(file_entries, fcl, "right")
- n += self._get_file_cumentries(i_file - 1)
- self.entry_map[i_file] = n
- return n
-
- def get_file_entrylist(self, i_file: int) -> np.ndarray:
- """Helper to get entry list for file"""
- # If no entry list is provided
- if self.local_entry_list is None:
- return None
-
- elist = self.local_entry_list[i_file]
- if elist is None:
- # Get local entrylist for this file from global entry list
- f_start = self._get_file_cumlen(i_file - 1)
- f_end = self._get_file_cumlen(i_file)
- i_start = self._get_file_cumentries(i_file - 1)
- i_stop = np.searchsorted(self.global_entry_list, f_end, "right")
- elist = np.array(self.global_entry_list[i_start:i_stop], "i") - f_start
- self.local_entry_list[i_file] = elist
- return elist
-
- def get_global_entrylist(self) -> np.ndarray:
- """Get global entry list, constructing it if needed"""
- if self.global_entry_list is None and self.local_entry_list is not None:
- self.global_entry_list = np.zeros(len(self), "i")
- for i_file in range(len(self.lh5_files)):
- i_start = self.get_file_cumentries(i_file - 1)
- i_stop = self.get_file_cumentries(i_file)
- f_start = self.get_file_cumlen(i_file - 1)
- self.global_entry_list[i_start:i_stop] = (
- self.get_file_entrylist(i_file) + f_start
- )
- return self.global_entry_list
-
- def read(self, entry: int) -> tuple[LGDO, int]:
- """Read the nextlocal chunk of events, starting at entry. Return the
- LH5 buffer and number of rows read."""
- self.n_rows = 0
- i_file = np.searchsorted(self.entry_map, entry, "right")
-
- # if file hasn't been opened yet, search through files
- # sequentially until we find the right one
- if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("i").max:
- while i_file < len(self.lh5_files) and entry >= self._get_file_cumentries(
- i_file
- ):
- i_file += 1
-
- if i_file == len(self.lh5_files):
- return (self.lh5_buffer, self.n_rows)
- local_entry = entry - self._get_file_cumentries(i_file - 1)
-
- while self.n_rows < self.buffer_len and i_file < len(self.file_map):
- # Loop through files
- local_idx = self.get_file_entrylist(i_file)
- if local_idx is not None and len(local_idx) == 0:
- i_file += 1
- local_entry = 0
- continue
-
- i_local = local_idx[local_entry] if local_idx is not None else local_entry
- self.lh5_buffer, n_rows = self.lh5_st.read_object(
- self.groups[i_file],
- self.lh5_files[i_file],
- start_row=i_local,
- n_rows=self.buffer_len - self.n_rows,
- idx=local_idx,
- field_mask=self.field_mask,
- obj_buf=self.lh5_buffer,
- obj_buf_start=self.n_rows,
- )
-
- self.n_rows += n_rows
- i_file += 1
- local_entry = 0
-
- self.current_entry = entry
-
- if self.friend is not None:
- self.friend.read(entry)
-
- return (self.lh5_buffer, self.n_rows)
-
- def reset_field_mask(self, mask):
- """Replaces the field mask of this iterator and any friends with mask"""
- self.field_mask = mask
- if self.friend is not None:
- self.friend.reset_field_mask(mask)
-
- def __len__(self) -> int:
- """Return the total number of entries."""
- return (
- self._get_file_cumentries(len(self.lh5_files) - 1)
- if len(self.entry_map) > 0
- else 0
- )
-
- def __iter__(self) -> Iterator:
- """Loop through entries in blocks of size buffer_len."""
- self.current_entry = 0
- self.next_entry = 0
- return self
-
- def __next__(self) -> tuple[LGDO, int, int]:
- """Read next buffer_len entries and return lh5_table, iterator entry
- and n_rows read."""
- buf, n_rows = self.read(self.next_entry)
- self.next_entry = self.current_entry + n_rows
- if n_rows == 0:
- raise StopIteration
- return (buf, self.current_entry, n_rows)
-
-
-@nb.njit(parallel=False, fastmath=True)
-def _make_fd_idx(starts, stops, idx):
- k = 0
- if len(starts) < len(stops):
- for i in range(stops[0]):
- idx[k] = i
- k += 1
- stops = stops[1:]
- for j in range(len(starts)):
- for i in range(starts[j], stops[j]):
- idx[k] = i
- k += 1
- return (idx,)
+ lh5.show(lh5_file, lh5_group, attrs, indent, header)
diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py
index 30a47bd2..54fd76f3 100644
--- a/src/lgdo/types/array.py
+++ b/src/lgdo/types/array.py
@@ -10,7 +10,7 @@
import numpy as np
-from .. import lgdo_utils as utils
+from .. import utils as utils
from .lgdo import LGDO
log = logging.getLogger(__name__)
diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py
index 95884bc9..bf16ed8d 100644
--- a/src/lgdo/types/arrayofequalsizedarrays.py
+++ b/src/lgdo/types/arrayofequalsizedarrays.py
@@ -9,7 +9,7 @@
import numpy as np
-from .. import lgdo_utils as utils
+from .. import utils as utils
from . import vectorofvectors as vov
from .array import Array
diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py
index 68886273..766001b3 100644
--- a/src/lgdo/types/encoded.py
+++ b/src/lgdo/types/encoded.py
@@ -6,7 +6,7 @@
import numpy as np
from numpy.typing import NDArray
-from .. import lgdo_utils as utils
+from .. import utils as utils
from .array import Array
from .lgdo import LGDO
from .scalar import Scalar
diff --git a/src/lgdo/types/scalar.py b/src/lgdo/types/scalar.py
index 6b793137..e79bb932 100644
--- a/src/lgdo/types/scalar.py
+++ b/src/lgdo/types/scalar.py
@@ -7,7 +7,7 @@
import numpy as np
-from .. import lgdo_utils as utils
+from .. import utils as utils
from .lgdo import LGDO
log = logging.getLogger(__name__)
diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py
index 7d227a52..2b0d7f13 100644
--- a/src/lgdo/types/vectorofvectors.py
+++ b/src/lgdo/types/vectorofvectors.py
@@ -13,7 +13,7 @@
import numpy as np
from numpy.typing import DTypeLike, NDArray
-from .. import lgdo_utils as utils
+from .. import utils as utils
from . import arrayofequalsizedarrays as aoesa
from .array import Array
from .lgdo import LGDO
diff --git a/src/lgdo/utils.py b/src/lgdo/utils.py
new file mode 100644
index 00000000..22866a35
--- /dev/null
+++ b/src/lgdo/utils.py
@@ -0,0 +1,84 @@
+"""Implements utilities for LEGEND Data Objects."""
+from __future__ import annotations
+
+import logging
+
+import numpy as np
+
+from . import types as lgdo
+
+log = logging.getLogger(__name__)
+
+
+def get_element_type(obj: object) -> str:
+ """Get the LGDO element type of a scalar or array.
+
+ For use in LGDO datatype attributes.
+
+ Parameters
+ ----------
+ obj
+ if a ``str``, ``string`` is returned immediately. Otherwise, if the
+ object has a :class:`numpy.dtype`, that is used to determine the
+ element type; failing that, the type of the object is cast to a
+ :class:`numpy.dtype`.
+
+ Returns
+ -------
+ element_type
+ A string stating the determined element type of the object.
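+
+ Examples
+ --------
+ A minimal doctest-style sketch of the mapping implemented below:
+
+ >>> get_element_type("hello")
+ 'string'
+ >>> import numpy as np
+ >>> get_element_type(np.zeros(3, dtype="float32"))
+ 'real'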
+ """
+
+ # special handling for strings
+ if isinstance(obj, str):
+ return "string"
+
+ # the rest use dtypes
+ dt = obj.dtype if hasattr(obj, "dtype") else np.dtype(type(obj))
+ kind = dt.kind
+
+ if kind == "b":
+ return "bool"
+ if kind == "V":
+ return "blob"
+ if kind in ["i", "u", "f"]:
+ return "real"
+ if kind == "c":
+ return "complex"
+ if kind in ["S", "U"]:
+ return "string"
+
+ # couldn't figure it out
+ raise ValueError(
+ f"cannot determine lgdo element_type for object of type {type(obj).__name__}"
+ )
+
+
+def copy(obj: lgdo.LGDO, dtype: np.dtype = None) -> lgdo.LGDO:
+ """Return a copy of an LGDO.
+
+ Parameters
+ ----------
+ obj
+ the LGDO to be copied.
+ dtype
+ NumPy dtype to be used for the copied object.
+
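+ Examples
+ --------
+ A minimal sketch of the supported copies (anything that is not an
+ :class:`.Array` or a :class:`.VectorOfVectors` raises :class:`ValueError`):
+
+ >>> import numpy as np
+ >>> from lgdo import Array
+ >>> copy(Array(np.array([1, 2, 3])), dtype=np.float64).nda.dtype
+ dtype('float64')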
+ """
+ if dtype is None:
+ dtype = obj.dtype
+
+ if isinstance(obj, lgdo.Array):
+ return lgdo.Array(
+ np.array(obj.nda, dtype=dtype, copy=True), attrs=dict(obj.attrs)
+ )
+
+ if isinstance(obj, lgdo.VectorOfVectors):
+ return lgdo.VectorOfVectors(
+ flattened_data=copy(obj.flattened_data, dtype=dtype),
+ cumulative_length=copy(obj.cumulative_length),
+ attrs=dict(obj.attrs),
+ )
+
+ else:
+ raise ValueError(f"copy of {type(obj)} not supported")
diff --git a/tests/compression/conftest.py b/tests/compression/conftest.py
index 927ba1ff..e69cc307 100644
--- a/tests/compression/conftest.py
+++ b/tests/compression/conftest.py
@@ -1,12 +1,12 @@
import pytest
-from lgdo import LH5Store
+import lgdo.lh5 as lh5
@pytest.fixture()
def wftable(lgnd_test_data):
- store = LH5Store()
- wft, _ = store.read_object(
+ store = lh5.LH5Store()
+ wft, _ = store.read(
"/geds/raw/waveform",
lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5"),
)
diff --git a/tests/compression/test_radware_sigcompress.py b/tests/compression/test_radware_sigcompress.py
index aacf38f6..fe0bdd99 100644
--- a/tests/compression/test_radware_sigcompress.py
+++ b/tests/compression/test_radware_sigcompress.py
@@ -2,7 +2,8 @@
import numpy as np
-from lgdo import ArrayOfEncodedEqualSizedArrays, ArrayOfEqualSizedArrays, LH5Store
+import lgdo.lh5 as lh5
+from lgdo import ArrayOfEncodedEqualSizedArrays, ArrayOfEqualSizedArrays
from lgdo.compression.radware import (
_get_hton_u16,
_radware_sigcompress_decode,
@@ -177,8 +178,8 @@ def test_aoesa(wftable):
def test_performance(lgnd_test_data):
- store = LH5Store()
- obj, _ = store.read_object(
+ store = lh5.LH5Store()
+ obj, _ = store.read(
"/geds/raw/waveform",
lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5"),
)
diff --git a/tests/test_lh5_iterator.py b/tests/lh5/test_lh5_iterator.py
similarity index 92%
rename from tests/test_lh5_iterator.py
rename to tests/lh5/test_lh5_iterator.py
index 09297665..95e575af 100644
--- a/tests/test_lh5_iterator.py
+++ b/tests/lh5/test_lh5_iterator.py
@@ -2,7 +2,7 @@
import pytest
import lgdo
-from lgdo.lh5_store import LH5Iterator
+import lgdo.lh5 as lh5
@pytest.fixture(scope="module")
@@ -11,7 +11,7 @@ def lgnd_file(lgnd_test_data):
def test_basics(lgnd_file):
- lh5_it = LH5Iterator(
+ lh5_it = lh5.LH5Iterator(
lgnd_file,
"/geds/raw",
entry_list=range(100),
@@ -35,14 +35,14 @@ def test_basics(lgnd_file):
def test_errors(lgnd_file):
with pytest.raises(RuntimeError):
- LH5Iterator("non-existent-file.lh5", "random-group")
+ lh5.LH5Iterator("non-existent-file.lh5", "random-group")
with pytest.raises(ValueError):
- LH5Iterator(1, 2)
+ lh5.LH5Iterator(1, 2)
def test_lgnd_waveform_table_fancy_idx(lgnd_file):
- lh5_it = LH5Iterator(
+ lh5_it = lh5.LH5Iterator(
lgnd_file,
"geds/raw/waveform",
entry_list=[
@@ -97,13 +97,13 @@ def more_lgnd_files(lgnd_test_data):
def test_friend(more_lgnd_files):
- lh5_raw_it = LH5Iterator(
+ lh5_raw_it = lh5.LH5Iterator(
more_lgnd_files[0],
"ch1084803/raw",
field_mask=["waveform", "baseline"],
buffer_len=5,
)
- lh5_it = LH5Iterator(
+ lh5_it = lh5.LH5Iterator(
more_lgnd_files[1],
"ch1084803/hit",
field_mask=["is_valid_0vbb"],
@@ -121,7 +121,7 @@ def test_friend(more_lgnd_files):
def test_iterate(more_lgnd_files):
# iterate through all hit groups in all files; there are 10 entries in
# each group/file
- lh5_it = LH5Iterator(
+ lh5_it = lh5.LH5Iterator(
more_lgnd_files[1] * 3,
["ch1084803/hit"] * 2 + ["ch1084804/hit"] * 2 + ["ch1121600/hit"] * 2,
field_mask=["is_valid_0vbb", "timestamp", "zacEmax_ctc_cal"],
diff --git a/tests/test_lh5_store.py b/tests/lh5/test_lh5_store.py
similarity index 71%
rename from tests/test_lh5_store.py
rename to tests/lh5/test_lh5_store.py
index 25491660..9d2d254c 100644
--- a/tests/test_lh5_store.py
+++ b/tests/lh5/test_lh5_store.py
@@ -7,10 +7,11 @@
import pytest
import lgdo
-import lgdo.lh5_store as lh5
+import lgdo.lh5 as lh5
+import lgdo.types as types
from lgdo import compression
from lgdo.compression import RadwareSigcompress
-from lgdo.lh5_store import DEFAULT_HDF5_SETTINGS, LH5Store
+from lgdo.lh5.store import DEFAULT_HDF5_SETTINGS
@pytest.fixture(scope="module")
@@ -19,11 +20,11 @@ def lgnd_file(lgnd_test_data):
def test_init():
- LH5Store()
+ lh5.LH5Store()
def test_gimme_file(lgnd_file):
- store = LH5Store(keep_open=True)
+ store = lh5.LH5Store(keep_open=True)
f = store.gimme_file(lgnd_file)
assert isinstance(f, h5py.File)
@@ -35,7 +36,7 @@ def test_gimme_file(lgnd_file):
def test_gimme_group(lgnd_file, tmptestdir):
f = h5py.File(lgnd_file)
- store = LH5Store()
+ store = lh5.LH5Store()
g = store.gimme_group("/geds", f)
assert isinstance(g, h5py.Group)
@@ -44,12 +45,6 @@ def test_gimme_group(lgnd_file, tmptestdir):
assert isinstance(g, h5py.Group)
-def test_show(lgnd_file):
- lh5.show(lgnd_file)
- lh5.show(lgnd_file, "/geds/raw")
- lh5.show(lgnd_file, "geds/raw")
-
-
def test_ls(lgnd_file):
assert lh5.ls(lgnd_file) == ["geds"]
assert lh5.ls(lgnd_file, "/*/raw") == ["geds/raw"]
@@ -68,6 +63,12 @@ def test_ls(lgnd_file):
]
+def test_show(lgnd_file):
+ lh5.show(lgnd_file)
+ lh5.show(lgnd_file, "/geds/raw")
+ lh5.show(lgnd_file, "geds/raw")
+
+
def test_load_nda(lgnd_file):
nda = lh5.load_nda(
[lgnd_file, lgnd_file],
@@ -83,49 +84,38 @@ def test_load_nda(lgnd_file):
assert nda["waveform/values"].shape == (6, 5592)
-def test_load_dfs(lgnd_file):
- dfs = lh5.load_dfs(
- [lgnd_file, lgnd_file],
- ["baseline", "waveform/t0"],
- lh5_group="/geds/raw",
- idx_list=[[1, 3, 5], [2, 6, 7]],
- )
-
- assert isinstance(dfs, pd.DataFrame)
-
-
@pytest.fixture(scope="module")
def lh5_file(tmptestdir):
- store = LH5Store()
+ store = lh5.LH5Store()
struct = lgdo.Struct()
struct.add_field("scalar", lgdo.Scalar(value=10, attrs={"sth": 1}))
- struct.add_field("array", lgdo.Array(nda=np.array([1, 2, 3, 4, 5])))
+ struct.add_field("array", types.Array(nda=np.array([1, 2, 3, 4, 5])))
struct.add_field(
"aoesa",
- lgdo.ArrayOfEqualSizedArrays(shape=(5, 5), dtype=np.float32, fill_val=42),
+ types.ArrayOfEqualSizedArrays(shape=(5, 5), dtype=np.float32, fill_val=42),
)
struct.add_field(
"vov",
- lgdo.VectorOfVectors(
- flattened_data=lgdo.Array(
+ types.VectorOfVectors(
+ flattened_data=types.Array(
nda=np.array([1, 2, 3, 4, 5, 2, 4, 8, 9, 7, 5, 3, 1])
),
- cumulative_length=lgdo.Array(nda=np.array([2, 5, 6, 10, 13])),
+ cumulative_length=types.Array(nda=np.array([2, 5, 6, 10, 13])),
attrs={"myattr": 2},
),
)
struct.add_field(
"voev",
- lgdo.VectorOfEncodedVectors(
- encoded_data=lgdo.VectorOfVectors(
- flattened_data=lgdo.Array(
+ types.VectorOfEncodedVectors(
+ encoded_data=types.VectorOfVectors(
+ flattened_data=types.Array(
nda=np.array([1, 2, 3, 4, 5, 2, 4, 8, 9, 7, 5, 3, 1])
),
- cumulative_length=lgdo.Array(nda=np.array([2, 5, 6, 10, 13])),
+ cumulative_length=types.Array(nda=np.array([2, 5, 6, 10, 13])),
),
- decoded_size=lgdo.Array(shape=5, fill_val=6),
+ decoded_size=types.Array(shape=5, fill_val=6),
),
)
@@ -142,14 +132,14 @@ def lh5_file(tmptestdir):
),
}
- struct.add_field("table", lgdo.Table(col_dict=col_dict, attrs={"stuff": 5}))
+ struct.add_field("table", types.Table(col_dict=col_dict, attrs={"stuff": 5}))
struct.add_field(
"wftable",
- lgdo.WaveformTable(
- t0=lgdo.Array(np.zeros(10)),
- dt=lgdo.Array(np.full(10, fill_value=1)),
- values=lgdo.ArrayOfEqualSizedArrays(
+ types.WaveformTable(
+ t0=types.Array(np.zeros(10)),
+ dt=types.Array(np.full(10, fill_value=1)),
+ values=types.ArrayOfEqualSizedArrays(
shape=(10, 1000), dtype=np.uint16, fill_val=100, attrs={"custom": 8}
),
),
@@ -157,16 +147,16 @@ def lh5_file(tmptestdir):
struct.add_field(
"wftable_enc",
- lgdo.WaveformTable(
- t0=lgdo.Array(np.zeros(10)),
- dt=lgdo.Array(np.full(10, fill_value=1)),
+ types.WaveformTable(
+ t0=types.Array(np.zeros(10)),
+ dt=types.Array(np.full(10, fill_value=1)),
values=compression.encode(
struct["wftable"].values, codec=RadwareSigcompress(codec_shift=-32768)
),
),
)
- store.write_object(
+ store.write(
struct,
"struct",
f"{tmptestdir}/tmp-pygama-lgdo-types.lh5",
@@ -176,7 +166,7 @@ def lh5_file(tmptestdir):
wo_mode="overwrite_file",
)
- store.write_object(
+ store.write(
struct,
"struct_full",
f"{tmptestdir}/tmp-pygama-lgdo-types.lh5",
@@ -194,7 +184,7 @@ def test_write_objects(lh5_file):
def test_read_n_rows(lh5_file):
- store = LH5Store()
+ store = lh5.LH5Store()
assert store.read_n_rows("/data/struct_full/aoesa", lh5_file) == 5
assert store.read_n_rows("/data/struct_full/array", lh5_file) == 5
assert store.read_n_rows("/data/struct_full/scalar", lh5_file) is None
@@ -206,14 +196,14 @@ def test_read_n_rows(lh5_file):
def test_get_buffer(lh5_file):
- store = LH5Store()
+ store = lh5.LH5Store()
buf = store.get_buffer("/data/struct_full/wftable_enc", lh5_file)
- assert isinstance(buf.values, lgdo.ArrayOfEqualSizedArrays)
+ assert isinstance(buf.values, types.ArrayOfEqualSizedArrays)
def test_read_scalar(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct/scalar", lh5_file)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/scalar", lh5_file)
assert isinstance(lh5_obj, lgdo.Scalar)
assert lh5_obj.value == 10
assert n_rows == 1
@@ -223,9 +213,9 @@ def test_read_scalar(lh5_file):
def test_read_array(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct/array", lh5_file)
- assert isinstance(lh5_obj, lgdo.Array)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/array", lh5_file)
+ assert isinstance(lh5_obj, types.Array)
assert (lh5_obj.nda == np.array([2, 3, 4])).all()
assert n_rows == 3
with h5py.File(lh5_file) as h5f:
@@ -236,19 +226,17 @@ def test_read_array(lh5_file):
def test_read_array_fancy_idx(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object(
- "/data/struct_full/array", lh5_file, idx=[0, 3, 4]
- )
- assert isinstance(lh5_obj, lgdo.Array)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct_full/array", lh5_file, idx=[0, 3, 4])
+ assert isinstance(lh5_obj, types.Array)
assert (lh5_obj.nda == np.array([1, 4, 5])).all()
assert n_rows == 3
def test_read_vov(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct/vov", lh5_file)
- assert isinstance(lh5_obj, lgdo.VectorOfVectors)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/vov", lh5_file)
+ assert isinstance(lh5_obj, types.VectorOfVectors)
desired = [np.array([3, 4, 5]), np.array([2]), np.array([4, 8, 9, 7])]
@@ -270,9 +258,9 @@ def test_read_vov(lh5_file):
def test_read_vov_fancy_idx(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct_full/vov", lh5_file, idx=[0, 2])
- assert isinstance(lh5_obj, lgdo.VectorOfVectors)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct_full/vov", lh5_file, idx=[0, 2])
+ assert isinstance(lh5_obj, types.VectorOfVectors)
desired = [np.array([1, 2]), np.array([2])]
@@ -283,9 +271,9 @@ def test_read_vov_fancy_idx(lh5_file):
def test_read_voev(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct/voev", lh5_file, decompress=False)
- assert isinstance(lh5_obj, lgdo.VectorOfEncodedVectors)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/voev", lh5_file, decompress=False)
+ assert isinstance(lh5_obj, types.VectorOfEncodedVectors)
desired = [np.array([3, 4, 5]), np.array([2]), np.array([4, 8, 9, 7])]
@@ -294,10 +282,10 @@ def test_read_voev(lh5_file):
assert n_rows == 3
- lh5_obj, n_rows = store.read_object(
+ lh5_obj, n_rows = store.read(
"/data/struct/voev", [lh5_file, lh5_file], decompress=False
)
- assert isinstance(lh5_obj, lgdo.VectorOfEncodedVectors)
+ assert isinstance(lh5_obj, types.VectorOfEncodedVectors)
assert n_rows == 6
with h5py.File(lh5_file) as h5f:
@@ -313,11 +301,11 @@ def test_read_voev(lh5_file):
def test_read_voev_fancy_idx(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object(
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read(
"/data/struct_full/voev", lh5_file, idx=[0, 2], decompress=False
)
- assert isinstance(lh5_obj, lgdo.VectorOfEncodedVectors)
+ assert isinstance(lh5_obj, types.VectorOfEncodedVectors)
desired = [np.array([1, 2]), np.array([2])]
@@ -328,27 +316,27 @@ def test_read_voev_fancy_idx(lh5_file):
def test_read_aoesa(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct/aoesa", lh5_file)
- assert isinstance(lh5_obj, lgdo.ArrayOfEqualSizedArrays)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/aoesa", lh5_file)
+ assert isinstance(lh5_obj, types.ArrayOfEqualSizedArrays)
assert (lh5_obj.nda == np.full((3, 5), fill_value=42)).all()
def test_read_table(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct/table", lh5_file)
- assert isinstance(lh5_obj, lgdo.Table)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/table", lh5_file)
+ assert isinstance(lh5_obj, types.Table)
assert n_rows == 3
- lh5_obj, n_rows = store.read_object("/data/struct/table", [lh5_file, lh5_file])
+ lh5_obj, n_rows = store.read("/data/struct/table", [lh5_file, lh5_file])
assert n_rows == 6
assert lh5_obj.attrs["stuff"] == 5
assert lh5_obj["a"].attrs["attr"] == 9
def test_read_hdf5_compressed_data(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct/table", lh5_file)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/table", lh5_file)
assert "compression" not in lh5_obj["b"].attrs
with h5py.File(lh5_file) as h5f:
@@ -363,12 +351,12 @@ def test_read_hdf5_compressed_data(lh5_file):
def test_read_wftable(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/data/struct/wftable", lh5_file)
- assert isinstance(lh5_obj, lgdo.WaveformTable)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/wftable", lh5_file)
+ assert isinstance(lh5_obj, types.WaveformTable)
assert n_rows == 3
- lh5_obj, n_rows = store.read_object("/data/struct/wftable", [lh5_file, lh5_file])
+ lh5_obj, n_rows = store.read("/data/struct/wftable", [lh5_file, lh5_file])
assert n_rows == 6
assert lh5_obj.values.attrs["custom"] == 8
@@ -388,32 +376,30 @@ def test_read_wftable(lh5_file):
def test_read_wftable_encoded(lh5_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object(
- "/data/struct/wftable_enc", lh5_file, decompress=False
- )
- assert isinstance(lh5_obj, lgdo.WaveformTable)
- assert isinstance(lh5_obj.values, lgdo.ArrayOfEncodedEqualSizedArrays)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/data/struct/wftable_enc", lh5_file, decompress=False)
+ assert isinstance(lh5_obj, types.WaveformTable)
+ assert isinstance(lh5_obj.values, types.ArrayOfEncodedEqualSizedArrays)
assert n_rows == 3
assert lh5_obj.values.attrs["codec"] == "radware_sigcompress"
assert "codec_shift" in lh5_obj.values.attrs
- lh5_obj, n_rows = store.read_object("/data/struct/wftable_enc/values", lh5_file)
+ lh5_obj, n_rows = store.read("/data/struct/wftable_enc/values", lh5_file)
assert isinstance(lh5_obj, lgdo.ArrayOfEqualSizedArrays)
assert n_rows == 3
- lh5_obj, n_rows = store.read_object("/data/struct/wftable_enc", lh5_file)
+ lh5_obj, n_rows = store.read("/data/struct/wftable_enc", lh5_file)
assert isinstance(lh5_obj, lgdo.WaveformTable)
assert isinstance(lh5_obj.values, lgdo.ArrayOfEqualSizedArrays)
assert n_rows == 3
- lh5_obj_chain, n_rows = store.read_object(
+ lh5_obj_chain, n_rows = store.read(
"/data/struct/wftable_enc", [lh5_file, lh5_file], decompress=False
)
assert n_rows == 6
assert isinstance(lh5_obj_chain.values, lgdo.ArrayOfEncodedEqualSizedArrays)
- lh5_obj_chain, n_rows = store.read_object(
+ lh5_obj_chain, n_rows = store.read(
"/data/struct/wftable_enc", [lh5_file, lh5_file], decompress=True
)
assert isinstance(lh5_obj_chain.values, lgdo.ArrayOfEqualSizedArrays)
@@ -440,24 +426,22 @@ def test_read_wftable_encoded(lh5_file):
def test_read_with_field_mask(lh5_file):
- store = LH5Store()
+ store = lh5.LH5Store()
- lh5_obj, n_rows = store.read_object(
- "/data/struct_full", lh5_file, field_mask=["array"]
- )
+ lh5_obj, n_rows = store.read("/data/struct_full", lh5_file, field_mask=["array"])
assert list(lh5_obj.keys()) == ["array"]
- lh5_obj, n_rows = store.read_object(
+ lh5_obj, n_rows = store.read(
"/data/struct_full", lh5_file, field_mask=("array", "table")
)
assert list(lh5_obj.keys()) == ["array", "table"]
- lh5_obj, n_rows = store.read_object(
+ lh5_obj, n_rows = store.read(
"/data/struct_full", lh5_file, field_mask={"array": True}
)
assert list(lh5_obj.keys()) == ["array"]
- lh5_obj, n_rows = store.read_object(
+ lh5_obj, n_rows = store.read(
"/data/struct_full", lh5_file, field_mask={"vov": False, "voev": False}
)
assert list(lh5_obj.keys()) == [
@@ -471,45 +455,45 @@ def test_read_with_field_mask(lh5_file):
def test_read_lgnd_array(lgnd_file):
- store = LH5Store()
+ store = lh5.LH5Store()
- lh5_obj, n_rows = store.read_object("/geds/raw/baseline", lgnd_file)
- assert isinstance(lh5_obj, lgdo.Array)
+ lh5_obj, n_rows = store.read("/geds/raw/baseline", lgnd_file)
+ assert isinstance(lh5_obj, types.Array)
assert n_rows == 100
assert len(lh5_obj) == 100
- lh5_obj, n_rows = store.read_object("/geds/raw/waveform/values", lgnd_file)
- assert isinstance(lh5_obj, lgdo.ArrayOfEqualSizedArrays)
+ lh5_obj, n_rows = store.read("/geds/raw/waveform/values", lgnd_file)
+ assert isinstance(lh5_obj, types.ArrayOfEqualSizedArrays)
def test_read_lgnd_array_fancy_idx(lgnd_file):
- store = LH5Store()
+ store = lh5.LH5Store()
- lh5_obj, n_rows = store.read_object(
+ lh5_obj, n_rows = store.read(
"/geds/raw/baseline", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68]
)
- assert isinstance(lh5_obj, lgdo.Array)
+ assert isinstance(lh5_obj, types.Array)
assert n_rows == 7
assert len(lh5_obj) == 7
assert (lh5_obj.nda == [13508, 14353, 14525, 14341, 15079, 11675, 13995]).all()
def test_read_lgnd_vov(lgnd_file):
- store = LH5Store()
+ store = lh5.LH5Store()
- lh5_obj, n_rows = store.read_object("/geds/raw/tracelist", lgnd_file)
- assert isinstance(lh5_obj, lgdo.VectorOfVectors)
+ lh5_obj, n_rows = store.read("/geds/raw/tracelist", lgnd_file)
+ assert isinstance(lh5_obj, types.VectorOfVectors)
assert n_rows == 100
assert len(lh5_obj) == 100
def test_read_lgnd_vov_fancy_idx(lgnd_file):
- store = LH5Store()
+ store = lh5.LH5Store()
- lh5_obj, n_rows = store.read_object(
+ lh5_obj, n_rows = store.read(
"/geds/raw/tracelist", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68]
)
- assert isinstance(lh5_obj, lgdo.VectorOfVectors)
+ assert isinstance(lh5_obj, types.VectorOfVectors)
assert n_rows == 7
assert len(lh5_obj) == 7
assert (lh5_obj.cumulative_length.nda == [1, 2, 3, 4, 5, 6, 7]).all()
@@ -517,20 +501,20 @@ def test_read_lgnd_vov_fancy_idx(lgnd_file):
def test_read_array_concatenation(lgnd_file):
- store = LH5Store()
- lh5_obj, n_rows = store.read_object("/geds/raw/baseline", [lgnd_file, lgnd_file])
- assert isinstance(lh5_obj, lgdo.Array)
+ store = lh5.LH5Store()
+ lh5_obj, n_rows = store.read("/geds/raw/baseline", [lgnd_file, lgnd_file])
+ assert isinstance(lh5_obj, types.Array)
assert n_rows == 200
assert len(lh5_obj) == 200
def test_read_lgnd_waveform_table(lgnd_file):
- store = LH5Store()
+ store = lh5.LH5Store()
- lh5_obj, n_rows = store.read_object("/geds/raw/waveform", lgnd_file)
- assert isinstance(lh5_obj, lgdo.WaveformTable)
+ lh5_obj, n_rows = store.read("/geds/raw/waveform", lgnd_file)
+ assert isinstance(lh5_obj, types.WaveformTable)
- lh5_obj, n_rows = store.read_object(
+ lh5_obj, n_rows = store.read(
"/geds/raw/waveform",
lgnd_file,
start_row=10,
@@ -538,29 +522,29 @@ def test_read_lgnd_waveform_table(lgnd_file):
field_mask=["t0", "dt"],
)
- assert isinstance(lh5_obj, lgdo.Table)
+ assert isinstance(lh5_obj, types.Table)
assert list(lh5_obj.keys()) == ["t0", "dt"]
assert len(lh5_obj) == 10
def test_read_lgnd_waveform_table_fancy_idx(lgnd_file):
- store = LH5Store()
+ store = lh5.LH5Store()
- lh5_obj, n_rows = store.read_object(
+ lh5_obj, n_rows = store.read(
"/geds/raw/waveform",
lgnd_file,
idx=[7, 9, 25, 27, 33, 38, 46, 52, 57, 59, 67, 71, 72, 82, 90, 92, 93, 94, 97],
)
- assert isinstance(lh5_obj, lgdo.WaveformTable)
+ assert isinstance(lh5_obj, types.WaveformTable)
assert len(lh5_obj) == 19
@pytest.fixture(scope="module")
def enc_lgnd_file(lgnd_file, tmptestdir):
- store = LH5Store()
- wft, n_rows = store.read_object("/geds/raw/waveform", lgnd_file)
+ store = lh5.LH5Store()
+ wft, n_rows = store.read("/geds/raw/waveform", lgnd_file)
wft.values.attrs["compression"] = RadwareSigcompress(codec_shift=-32768)
- store.write_object(
+ store.write(
wft,
"/geds/raw/waveform",
f"{tmptestdir}/tmp-pygama-compressed-wfs.lh5",
@@ -574,16 +558,16 @@ def test_write_compressed_lgnd_waveform_table(enc_lgnd_file):
def test_read_compressed_lgnd_waveform_table(lgnd_file, enc_lgnd_file):
- store = LH5Store()
- wft, _ = store.read_object("/geds/raw/waveform", enc_lgnd_file)
- assert isinstance(wft.values, lgdo.ArrayOfEqualSizedArrays)
+ store = lh5.LH5Store()
+ wft, _ = store.read("/geds/raw/waveform", enc_lgnd_file)
+ assert isinstance(wft.values, types.ArrayOfEqualSizedArrays)
assert "compression" not in wft.values.attrs
def test_write_with_hdf5_compression(lgnd_file, tmptestdir):
- store = LH5Store()
- wft, n_rows = store.read_object("/geds/raw/waveform", lgnd_file)
- store.write_object(
+ store = lh5.LH5Store()
+ wft, n_rows = store.read("/geds/raw/waveform", lgnd_file)
+ store.write(
wft,
"/geds/raw/waveform",
f"{tmptestdir}/tmp-pygama-hdf5-compressed-wfs.lh5",
@@ -597,7 +581,7 @@ def test_write_with_hdf5_compression(lgnd_file, tmptestdir):
assert h5f["/geds/raw/waveform/values"].compression_opts == 9
assert h5f["/geds/raw/waveform/values"].shuffle is True
- store.write_object(
+ store.write(
wft,
"/geds/raw/waveform",
f"{tmptestdir}/tmp-pygama-hdf5-compressed-wfs.lh5",
@@ -618,13 +602,13 @@ def test_write_object_overwrite_table_no_deletion(caplog, tmptestdir):
if os.path.exists(f"{tmptestdir}/write_object_overwrite_test.lh5"):
os.remove(f"{tmptestdir}/write_object_overwrite_test.lh5")
- tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))})
- tb2 = lh5.Table(
- col_dict={"dset1": lh5.Array(np.ones(10))}
+ tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))})
+ tb2 = types.Table(
+ col_dict={"dset1": types.Array(np.ones(10))}
) # Same field name, different values
- store = LH5Store()
- store.write_object(tb1, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5")
- store.write_object(
+ store = lh5.LH5Store()
+ store.write(tb1, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5")
+ store.write(
tb2,
"my_group",
f"{tmptestdir}/write_object_overwrite_test.lh5",
@@ -637,9 +621,7 @@ def test_write_object_overwrite_table_no_deletion(caplog, tmptestdir):
]
# Now, check that the data were overwritten
- tb_dat, _ = store.read_object(
- "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5"
- )
+ tb_dat, _ = store.read("my_group", f"{tmptestdir}/write_object_overwrite_test.lh5")
assert np.array_equal(tb_dat["dset1"].nda, np.ones(10))
@@ -651,13 +633,13 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir):
if os.path.exists(f"{tmptestdir}/write_object_overwrite_test.lh5"):
os.remove(f"{tmptestdir}/write_object_overwrite_test.lh5")
- tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))})
- tb2 = lh5.Table(
- col_dict={"dset2": lh5.Array(np.ones(10))}
+ tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))})
+ tb2 = types.Table(
+ col_dict={"dset2": types.Array(np.ones(10))}
) # Same field name, different values
- store = LH5Store()
- store.write_object(tb1, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5")
- store.write_object(
+ store = lh5.LH5Store()
+ store.write(tb1, "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5")
+ store.write(
tb2,
"my_group",
f"{tmptestdir}/write_object_overwrite_test.lh5",
@@ -665,9 +647,7 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir):
) # Now, try to overwrite with a different field
# Now, check that the data were overwritten
- tb_dat, _ = store.read_object(
- "my_group", f"{tmptestdir}/write_object_overwrite_test.lh5"
- )
+ tb_dat, _ = store.read("my_group", f"{tmptestdir}/write_object_overwrite_test.lh5")
assert np.array_equal(tb_dat["dset2"].nda, np.ones(10))
# Also make sure that the first table's fields aren't lurking around the lh5 file!
@@ -678,18 +658,18 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir):
if os.path.exists(f"{tmptestdir}/write_object_overwrite_test.lh5"):
os.remove(f"{tmptestdir}/write_object_overwrite_test.lh5")
- tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))})
- tb2 = lh5.Table(
- col_dict={"dset2": lh5.Array(np.ones(10))}
+ tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))})
+ tb2 = types.Table(
+ col_dict={"dset2": types.Array(np.ones(10))}
) # Same field name, different values
- store = LH5Store()
- store.write_object(
+ store = lh5.LH5Store()
+ store.write(
tb1,
"my_table",
f"{tmptestdir}/write_object_overwrite_test.lh5",
group="my_group",
)
- store.write_object(
+ store.write(
tb2,
"my_table",
f"{tmptestdir}/write_object_overwrite_test.lh5",
@@ -698,7 +678,7 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir):
) # Now, try to overwrite with a different field
# Now, check that the data were overwritten
- tb_dat, _ = store.read_object(
+ tb_dat, _ = store.read(
"my_group/my_table", f"{tmptestdir}/write_object_overwrite_test.lh5"
)
assert np.array_equal(tb_dat["dset2"].nda, np.ones(10))
@@ -713,11 +693,11 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir):
caplog.set_level(logging.DEBUG)
caplog.clear()
- # Start with an lgdo.WaveformTable
+ # Start with a types.WaveformTable
if os.path.exists(f"{tmptestdir}/write_object_overwrite_test.lh5"):
os.remove(f"{tmptestdir}/write_object_overwrite_test.lh5")
- tb1 = lh5.WaveformTable(
+ tb1 = types.WaveformTable(
t0=np.zeros(10),
t0_units="ns",
dt=np.zeros(10),
@@ -725,7 +705,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir):
values=np.zeros((10, 10)),
values_units="ADC",
)
- tb2 = lh5.WaveformTable(
+ tb2 = types.WaveformTable(
t0=np.ones(10),
t0_units="ns",
dt=np.ones(10),
@@ -733,14 +713,14 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir):
values=np.ones((10, 10)),
values_units="ADC",
) # Same field name, different values
- store = LH5Store()
- store.write_object(
+ store = lh5.LH5Store()
+ store.write(
tb1,
"my_table",
f"{tmptestdir}/write_object_overwrite_test.lh5",
group="my_group",
)
- store.write_object(
+ store.write(
tb2,
"my_table",
f"{tmptestdir}/write_object_overwrite_test.lh5",
@@ -754,19 +734,17 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir):
]
# Now, check that the data were overwritten
- tb_dat, _ = store.read_object(
+ tb_dat, _ = store.read(
"my_group/my_table", f"{tmptestdir}/write_object_overwrite_test.lh5"
)
assert np.array_equal(tb_dat["values"].nda, np.ones((10, 10)))
# Now try overwriting an array, and test the write_start argument
- array1 = lh5.Array(nda=np.zeros(10))
- array2 = lh5.Array(nda=np.ones(20))
- store = LH5Store()
- store.write_object(
- array1, "my_array", f"{tmptestdir}/write_object_overwrite_test.lh5"
- )
- store.write_object(
+ array1 = types.Array(nda=np.zeros(10))
+ array2 = types.Array(nda=np.ones(20))
+ store = lh5.LH5Store()
+ store.write(array1, "my_array", f"{tmptestdir}/write_object_overwrite_test.lh5")
+ store.write(
array2,
"my_array",
f"{tmptestdir}/write_object_overwrite_test.lh5",
@@ -775,7 +753,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir):
)
# Now, check that the data were overwritten
- array_dat, _ = store.read_object(
+ array_dat, _ = store.read(
"my_array", f"{tmptestdir}/write_object_overwrite_test.lh5"
)
expected_out_array = np.append(np.zeros(5), np.ones(20))
@@ -783,13 +761,11 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir):
assert np.array_equal(array_dat.nda, expected_out_array)
# Now try overwriting a scalar
- scalar1 = lh5.Scalar(0)
- scalar2 = lh5.Scalar(1)
- store = LH5Store()
- store.write_object(
- scalar1, "my_scalar", f"{tmptestdir}/write_object_overwrite_test.lh5"
- )
- store.write_object(
+ scalar1 = types.Scalar(0)
+ scalar2 = types.Scalar(1)
+ store = lh5.LH5Store()
+ store.write(scalar1, "my_scalar", f"{tmptestdir}/write_object_overwrite_test.lh5")
+ store.write(
scalar2,
"my_scalar",
f"{tmptestdir}/write_object_overwrite_test.lh5",
@@ -797,20 +773,18 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir):
)
# Now, check that the data were overwritten
- scalar_dat, _ = store.read_object(
+ scalar_dat, _ = store.read(
"my_scalar", f"{tmptestdir}/write_object_overwrite_test.lh5"
)
assert scalar_dat.value == 1
# Finally, try overwriting a vector of vectors
- vov1 = lh5.VectorOfVectors(listoflists=[np.zeros(1), np.ones(2), np.zeros(3)])
- vov2 = lh5.VectorOfVectors(listoflists=[np.ones(1), np.zeros(2), np.ones(3)])
- store = LH5Store()
- store.write_object(
- vov1, "my_vector", f"{tmptestdir}/write_object_overwrite_test.lh5"
- )
- store.write_object(
+ vov1 = types.VectorOfVectors(listoflists=[np.zeros(1), np.ones(2), np.zeros(3)])
+ vov2 = types.VectorOfVectors(listoflists=[np.ones(1), np.zeros(2), np.ones(3)])
+ store = lh5.LH5Store()
+ store.write(vov1, "my_vector", f"{tmptestdir}/write_object_overwrite_test.lh5")
+ store.write(
vov2,
"my_vector",
f"{tmptestdir}/write_object_overwrite_test.lh5",
@@ -818,7 +792,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir):
write_start=1,
) # start overwriting the second list of lists
- vector_dat, _ = store.read_object(
+ vector_dat, _ = store.read(
"my_vector", f"{tmptestdir}/write_object_overwrite_test.lh5"
)
@@ -832,14 +806,12 @@ def test_write_object_append_column(tmptestdir):
if os.path.exists(f"{tmptestdir}/write_object_append_column_test.lh5"):
os.remove(f"{tmptestdir}/write_object_append_column_test.lh5")
- array1 = lh5.Array(np.zeros(10))
- tb1 = lh5.Table(col_dict={"dset1`": lh5.Array(np.ones(10))})
- store = LH5Store()
- store.write_object(
- array1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5"
- )
+ array1 = types.Array(np.zeros(10))
+ tb1 = types.Table(col_dict={"dset1`": types.Array(np.ones(10))})
+ store = lh5.LH5Store()
+ store.write(array1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5")
with pytest.raises(RuntimeError) as exc_info:
- store.write_object(
+ store.write(
tb1,
"my_table",
f"{tmptestdir}/write_object_append_column_test.lh5",
@@ -855,18 +827,19 @@ def test_write_object_append_column(tmptestdir):
if os.path.exists(f"{tmptestdir}/write_object_append_column_test.lh5"):
os.remove(f"{tmptestdir}/write_object_append_column_test.lh5")
- tb1 = lh5.Table(
- col_dict={"dset1": lh5.Array(np.zeros(10)), "dset2": lh5.Array(np.zeros(10))}
+ tb1 = types.Table(
+ col_dict={
+ "dset1": types.Array(np.zeros(10)),
+ "dset2": types.Array(np.zeros(10)),
+ }
)
- tb2 = lh5.Table(
- col_dict={"dset2": lh5.Array(np.ones(10))}
+ tb2 = types.Table(
+ col_dict={"dset2": types.Array(np.ones(10))}
) # Same field name, different values
- store = LH5Store()
- store.write_object(
- tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5"
- )
+ store = lh5.LH5Store()
+ store.write(tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5")
with pytest.raises(ValueError) as exc_info:
- store.write_object(
+ store.write(
tb2,
"my_table",
f"{tmptestdir}/write_object_append_column_test.lh5",
@@ -883,16 +856,14 @@ def test_write_object_append_column(tmptestdir):
if os.path.exists(f"{tmptestdir}/write_object_append_column_test.lh5"):
os.remove(f"{tmptestdir}/write_object_append_column_test.lh5")
- tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))})
- tb2 = lh5.Table(
- col_dict={"dset2": lh5.Array(np.ones(20))}
+ tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))})
+ tb2 = types.Table(
+ col_dict={"dset2": types.Array(np.ones(20))}
) # different field name, different size
- store = LH5Store()
- store.write_object(
- tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5"
- )
+ store = lh5.LH5Store()
+ store.write(tb1, "my_table", f"{tmptestdir}/write_object_append_column_test.lh5")
with pytest.raises(ValueError) as exc_info:
- store.write_object(
+ store.write(
tb2,
"my_table",
f"{tmptestdir}/write_object_append_column_test.lh5",
@@ -909,18 +880,18 @@ def test_write_object_append_column(tmptestdir):
if os.path.exists(f"{tmptestdir}/write_object_append_column_test.lh5"):
os.remove(f"{tmptestdir}/write_object_append_column_test.lh5")
- tb1 = lh5.Table(col_dict={"dset1": lh5.Array(np.zeros(10))})
- tb2 = lh5.Table(
- col_dict={"dset2": lh5.Array(np.ones(10))}
+ tb1 = types.Table(col_dict={"dset1": types.Array(np.zeros(10))})
+ tb2 = types.Table(
+ col_dict={"dset2": types.Array(np.ones(10))}
) # different field name, same size
- store = LH5Store()
- store.write_object(
+ store = lh5.LH5Store()
+ store.write(
tb1,
"my_table",
f"{tmptestdir}/write_object_append_column_test.lh5",
group="my_group",
)
- store.write_object(
+ store.write(
tb2,
"my_table",
f"{tmptestdir}/write_object_append_column_test.lh5",
@@ -929,9 +900,20 @@ def test_write_object_append_column(tmptestdir):
)
# Now, check that the data were appended
- tb_dat, _ = store.read_object(
+ tb_dat, _ = store.read(
"my_group/my_table", f"{tmptestdir}/write_object_append_column_test.lh5"
)
- assert isinstance(tb_dat, lgdo.Table)
+ assert isinstance(tb_dat, types.Table)
assert np.array_equal(tb_dat["dset1"].nda, np.zeros(10))
assert np.array_equal(tb_dat["dset2"].nda, np.ones(10))
+
+
+def test_load_dfs(lgnd_file):
+ dfs = lh5.load_dfs(
+ [lgnd_file, lgnd_file],
+ ["baseline", "waveform/t0"],
+ lh5_group="/geds/raw",
+ idx_list=[[1, 3, 5], [2, 6, 7]],
+ )
+
+ assert isinstance(dfs, pd.DataFrame)
diff --git a/tests/lh5/test_lh5_utils.py b/tests/lh5/test_lh5_utils.py
new file mode 100644
index 00000000..c83dd9a9
--- /dev/null
+++ b/tests/lh5/test_lh5_utils.py
@@ -0,0 +1,72 @@
+import os
+
+import pytest
+
+import lgdo.lh5.utils as utils
+
+
+@pytest.fixture(scope="module")
+def lgnd_file(lgnd_test_data):
+ return lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5")
+
+
+def test_parse_datatype():
+ datatypes = [
+ ("real", ("scalar", None, "real")),
+ ("array<1>{bool}", ("array", (1,), "bool")),
+ ("fixedsizearray<2>{real}", ("fixedsizearray", (2,), "real")),
+ (
+ "arrayofequalsizedarrays<3,4>{complex}",
+ ("arrayofequalsizedarrays", (3, 4), "complex"),
+ ),
+ ("array<1>{array<1>{blob}}", ("array", (1,), "array<1>{blob}")),
+ (
+ "struct{field1,field2,fieldn}",
+ ("struct", None, ["field1", "field2", "fieldn"]),
+ ),
+ ("table{col1,col2,coln}", ("table", None, ["col1", "col2", "coln"])),
+ ]
+
+ for string, dt_tuple in datatypes:
+ pd_dt_tuple = utils.parse_datatype(string)
+ assert pd_dt_tuple == dt_tuple
+
+
+def test_expand_vars():
+ # Check env variable expansion
+ os.environ["PYGAMATESTBASEDIR"] = "a_random_string"
+ assert utils.expand_vars("$PYGAMATESTBASEDIR/blah") == "a_random_string/blah"
+
+ # Check user variable expansion
+ assert (
+ utils.expand_vars(
+ "$PYGAMATESTBASEDIR2/blah",
+ substitute={"PYGAMATESTBASEDIR2": "a_random_string"},
+ )
+ == "a_random_string/blah"
+ )
+
+
+def test_expand_path(lgnd_test_data):
+ files = [
+ lgnd_test_data.get_path(
+ "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r001/l200-p03-r001-cal-20230318T012144Z-tier_dsp.lh5"
+ ),
+ lgnd_test_data.get_path(
+ "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r001/l200-p03-r001-cal-20230318T012228Z-tier_dsp.lh5"
+ ),
+ ]
+ base_dir = os.path.dirname(files[0])
+
+ assert utils.expand_path(f"{base_dir}/*20230318T012144Z*") == files[0]
+
+ # Should fail if file not found
+ with pytest.raises(FileNotFoundError):
+ utils.expand_path(f"{base_dir}/not_a_real_file.lh5")
+
+ # Should fail if multiple files found
+ with pytest.raises(FileNotFoundError):
+ utils.expand_path(f"{base_dir}/*.lh5")
+
+ # Check if it finds a list of files correctly
+ assert sorted(utils.expand_path(f"{base_dir}/*.lh5", list=True)) == sorted(files)
diff --git a/tests/test_lgdo_utils.py b/tests/test_lgdo_utils.py
index 49df91ca..ce86d971 100644
--- a/tests/test_lgdo_utils.py
+++ b/tests/test_lgdo_utils.py
@@ -1,9 +1,6 @@
-import os
-
import numpy as np
-import pytest
-import lgdo.lgdo_utils as lgdo_utils
+import lgdo.utils as utils
def test_get_element_type():
@@ -20,69 +17,5 @@ def test_get_element_type():
]
for obj, name in objs:
- get_name = lgdo_utils.get_element_type(obj)
+ get_name = utils.get_element_type(obj)
assert get_name == name
-
-
-def test_parse_datatype():
- datatypes = [
- ("real", ("scalar", None, "real")),
- ("array<1>{bool}", ("array", (1,), "bool")),
- ("fixedsizearray<2>{real}", ("fixedsizearray", (2,), "real")),
- (
- "arrayofequalsizedarrays<3,4>{complex}",
- ("arrayofequalsizedarrays", (3, 4), "complex"),
- ),
- ("array<1>{array<1>{blob}}", ("array", (1,), "array<1>{blob}")),
- (
- "struct{field1,field2,fieldn}",
- ("struct", None, ["field1", "field2", "fieldn"]),
- ),
- ("table{col1,col2,coln}", ("table", None, ["col1", "col2", "coln"])),
- ]
-
- for string, dt_tuple in datatypes:
- pd_dt_tuple = lgdo_utils.parse_datatype(string)
- assert pd_dt_tuple == dt_tuple
-
-
-def test_expand_vars():
- # Check env variable expansion
- os.environ["PYGAMATESTBASEDIR"] = "a_random_string"
- assert lgdo_utils.expand_vars("$PYGAMATESTBASEDIR/blah") == "a_random_string/blah"
-
- # Check user variable expansion
- assert (
- lgdo_utils.expand_vars(
- "$PYGAMATESTBASEDIR2/blah",
- substitute={"PYGAMATESTBASEDIR2": "a_random_string"},
- )
- == "a_random_string/blah"
- )
-
-
-def test_expand_path(lgnd_test_data):
- files = [
- lgnd_test_data.get_path(
- "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r001/l200-p03-r001-cal-20230318T012144Z-tier_dsp.lh5"
- ),
- lgnd_test_data.get_path(
- "lh5/prod-ref-l200/generated/tier/dsp/cal/p03/r001/l200-p03-r001-cal-20230318T012228Z-tier_dsp.lh5"
- ),
- ]
- base_dir = os.path.dirname(files[0])
-
- assert lgdo_utils.expand_path(f"{base_dir}/*20230318T012144Z*") == files[0]
-
- # Should fail if file not found
- with pytest.raises(FileNotFoundError):
- lgdo_utils.expand_path(f"{base_dir}/not_a_real_file.lh5")
-
- # Should fail if multiple files found
- with pytest.raises(FileNotFoundError):
- lgdo_utils.expand_path(f"{base_dir}/*.lh5")
-
- # Check if it finds a list of files correctly
- assert sorted(lgdo_utils.expand_path(f"{base_dir}/*.lh5", list=True)) == sorted(
- files
- )
diff --git a/tests/types/test_array.py b/tests/types/test_array.py
index 0932c99b..df1bcd3c 100644
--- a/tests/types/test_array.py
+++ b/tests/types/test_array.py
@@ -1,6 +1,6 @@
import numpy as np
-import lgdo.lgdo_utils as utils
+import lgdo.utils as utils
from lgdo import Array
diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py
index 4126d119..71c20ea8 100644
--- a/tests/types/test_vectorofvectors.py
+++ b/tests/types/test_vectorofvectors.py
@@ -2,7 +2,7 @@
import pytest
import lgdo
-import lgdo.lgdo_utils as utils
+import lgdo.utils as utils
from lgdo import VectorOfVectors
from lgdo.types import vectorofvectors as vov