Merge pull request #30 from MoritzNeuberger/issue_4_lgdo_format_conve…

…rsion Add LGDO format conversion utilities
legend-exp · Dec 30, 2023 · 021f397 · 021f397
2 parents c3b2475 + f5a317c
commit 021f397
Show file tree

Hide file tree

Showing 26 changed files with 902 additions and 78 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -51,6 +51,7 @@
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3", None),
     "numpy": ("https://numpy.org/doc/stable", None),
+    "awkward": ("https://awkward-array.org/doc/stable", None),
     "numba": ("https://numba.readthedocs.io/en/stable", None),
     "pandas": ("https://pandas.pydata.org/docs", None),
     "h5py": ("https://docs.h5py.org/en/stable", None),

diff --git a/docs/source/notebooks/LH5Files.ipynb b/docs/source/notebooks/LH5Files.ipynb
@@ -48,7 +48,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from lgdo import ls\n",
+    "from lgdo.lh5 import ls\n",
     "\n",
     "ls(lh5_file)"
    ]
@@ -91,7 +91,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from lgdo import show\n",
+    "from lgdo.lh5 import show\n",
     "\n",
     "show(lh5_file)"
    ]
@@ -111,7 +111,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from lgdo import LH5Store\n",
+    "from lgdo.lh5 import LH5Store\n",
     "\n",
     "store = LH5Store()"
    ]
@@ -210,12 +210,141 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from lgdo import LH5Iterator\n",
+    "from lgdo.lh5 import LH5Iterator\n",
     "\n",
     "for lh5_obj, entry, n_rows in LH5Iterator(lh5_file, \"geds/raw/energy\", buffer_len=20):\n",
     "    print(f\"entry {entry}, energy = {lh5_obj} ({n_rows} rows)\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "684f8530",
+   "metadata": {},
+   "source": [
+    "### Converting LGDO data to alternative formats\n",
+    "\n",
+    "Each LGDO is equipped with a method called `view_as()` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.types.html#lgdo.types.lgdo.LGDO.view_as), which allows the user to \"view\" the data (i.e. avoiding copying data as much as possible) in a different, third-party format.\n",
+    "\n",
+    "LGDOs generally support viewing as NumPy (`np`), Pandas (`pd`) or [Awkward](https://awkward-array.org) (`ak`) data structures, with some exceptions. We strongly recommend having a look at the `view_as()` API docs of each LGDO type for more details (for `Table.view_as()` [[docs]](https://legend-pydataobj.readthedocs.io/en/stable/api/lgdo.types.html#lgdo.types.table.Table.view_as), for example).\n",
+    "\n",
+    "<div class=\"alert alert-info\">\n",
+    "\n",
+    "**Note:** To obtain a copy of the data in the selected third-party format, the user can call the appropriate third-party copy method on the view (e.g. `pandas.DataFrame.copy()`, if viewing the data as a Pandas dataframe).\n",
+    "\n",
+    "</div>\n",
+    "\n",
+    "Let's play around with our good old table: can we view it as a Pandas dataframe?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2f48391",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj, _ = store.read(\"geds/raw\", lh5_file)\n",
+    "df = obj.view_as(\"pd\")\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7f476362",
+   "metadata": {},
+   "source": [
+    "Yes! But how are the nested objects being handled?\n",
+    "\n",
+    "Nested tables have been flattened by prefixing their column names with the table object name (`obj.waveform.values` becomes `df.waveform_values`) and multi-dimensional columns are represented by Awkward arrays:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6261c8fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.waveform_values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ed5904a",
+   "metadata": {},
+   "source": [
+    "But what if we wanted to have the waveform values as a NumPy array?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4b45112",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj.waveform.values.view_as(\"np\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0c86728",
+   "metadata": {},
+   "source": [
+    "Can we just view the full table as a huge Awkward array? Of course:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "33ae5c21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj.view_as(\"ak\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd5fa308",
+   "metadata": {},
+   "source": [
+    "Note that viewing a `VectorOfVectors` as an Awkward array is a nearly zero-copy operation and opens a new avenue of fast computational possibilities thanks to Awkward:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d75c8ff8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import awkward as ak\n",
+    "\n",
+    "# tracelist is a VoV on disk\n",
+    "trlist = obj.tracelist.view_as(\"ak\")\n",
+    "ak.mean(trlist)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8d9ad8c",
+   "metadata": {},
+   "source": [
+    "Last but not least, we support attaching physical units (that might be stored in the `units` attribute of an LGDO) to data views through Pint, if the third-party format allows it:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4007efd4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = obj.view_as(\"pd\", with_units=True)\n",
+    "df.timestamp.dtype"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "3ab3794c",
@@ -278,7 +407,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from lgdo import show\n",
+    "from lgdo.lh5 import show\n",
     "\n",
     "show(\"my_objects.lh5\")"
    ]

diff --git a/setup.cfg b/setup.cfg
@@ -30,6 +30,8 @@ classifiers =
 [options]
 packages = find:
 install_requires =
+    awkward>=2
+    awkward-pandas
     colorlog
     h5py>=3.2
     hdf5plugin
@@ -39,6 +41,7 @@ install_requires =
     pandas>=1.4.4
     parse
     pint
+    pint-pandas
 python_requires = >=3.9
 include_package_data = True
 package_dir =

diff --git a/src/lgdo/compression/radware.py b/src/lgdo/compression/radware.py
@@ -130,7 +130,9 @@ def encode(
             )
         # convert VectorOfVectors to ArrayOfEqualSizedArrays so it can be
         # directly passed to the low-level encoding routine
-        sig_out_nda, nbytes = encode(sig_in.to_aoesa(), shift=shift)
+        sig_out_nda, nbytes = encode(
+            sig_in.to_aoesa(fill_val=0, preserve_dtype=True), shift=shift
+        )
 
         # build the encoded LGDO
         encoded_data = lgdo.ArrayOfEqualSizedArrays(nda=sig_out_nda).to_vov(
@@ -263,7 +265,7 @@ def decode(
         # convert vector of vectors to array of equal sized arrays
         # can now decode on the 2D matrix together with number of bytes to read per row
         _, siglen = decode(
-            (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes),
+            (sig_in.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda, nbytes),
             sig_out if isinstance(sig_out, np.ndarray) else sig_out.nda,
             shift=shift,
         )
@@ -289,7 +291,8 @@ def decode(
         # convert vector of vectors to array of equal sized arrays
         # can now decode on the 2D matrix together with number of bytes to read per row
         sig_out, siglen = decode(
-            (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes), shift=shift
+            (sig_in.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda, nbytes),
+            shift=shift,
         )
 
         # sanity check

diff --git a/src/lgdo/compression/varlen.py b/src/lgdo/compression/varlen.py
@@ -104,7 +104,7 @@ def encode(
             )
         # convert VectorOfVectors to ArrayOfEqualSizedArrays so it can be
         # directly passed to the low-level encoding routine
-        sig_out_nda, nbytes = encode(sig_in.to_aoesa())
+        sig_out_nda, nbytes = encode(sig_in.to_aoesa(fill_val=0, preserve_dtype=True))
 
         # build the encoded LGDO
         encoded_data = lgdo.ArrayOfEqualSizedArrays(nda=sig_out_nda).to_vov(
@@ -228,7 +228,7 @@ def decode(
         # convert vector of vectors to array of equal sized arrays
         # can now decode on the 2D matrix together with number of bytes to read per row
         _, siglen = decode(
-            (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes),
+            (sig_in.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda, nbytes),
             sig_out if isinstance(sig_out, np.ndarray) else sig_out.nda,
         )
 
@@ -253,7 +253,7 @@ def decode(
         # convert vector of vectors to array of equal sized arrays
         # can now decode on the 2D matrix together with number of bytes to read per row
         sig_out, siglen = decode(
-            (sig_in.encoded_data.to_aoesa(preserve_dtype=True).nda, nbytes)
+            (sig_in.encoded_data.to_aoesa(fill_val=0, preserve_dtype=True).nda, nbytes)
         )
 
         # sanity check

diff --git a/src/lgdo/lh5/store.py b/src/lgdo/lh5/store.py
@@ -53,7 +53,7 @@ class LH5Store:
     >>> store = LH5Store()
     >>> obj, _ = store.read("/geds/waveform", "file.lh5")
     >>> type(obj)
-    lgdo.waveform_table.WaveformTable
+    lgdo.waveformtable.WaveformTable
     """
 
     def __init__(self, base_path: str = "", keep_open: bool = False) -> None:
@@ -890,13 +890,13 @@ def write(
         `compression` attribute.
 
         Note
-        ----
+        ----------
         The `compression` LGDO attribute takes precedence over the default HDF5
         compression settings. The `hdf5_settings` attribute takes precedence
         over `compression`. These attributes are not written to disk.
 
         Note
-        ----
+        ----------
         HDF5 compression is skipped for the `encoded_data.flattened_data`
         dataset of :class:`.VectorOfEncodedVectors` and
         :class:`.ArrayOfEncodedEqualSizedArrays`.

diff --git a/src/lgdo/types/__init__.py b/src/lgdo/types/__init__.py
@@ -9,7 +9,7 @@
 from .struct import Struct
 from .table import Table
 from .vectorofvectors import VectorOfVectors
-from .waveform_table import WaveformTable
+from .waveformtable import WaveformTable
 
 __all__ = [
     "Array",

diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py
@@ -8,9 +8,14 @@
 from collections.abc import Iterator
 from typing import Any
 
+import awkward as ak
+import awkward_pandas as akpd
 import numpy as np
+import pandas as pd
+import pint_pandas  # noqa: F401
 
 from .. import utils as utils
+from ..units import default_units_registry as u
 from .lgdo import LGDO
 
 log = logging.getLogger(__name__)
@@ -138,3 +143,58 @@ def __repr__(self) -> str:
             )
             + f", attrs={repr(self.attrs)})"
         )
+
+    def view_as(
+        self, library: str, with_units: bool = False
+    ) -> pd.Series | np.ndarray | ak.Array:
+        """View the Array data as a third-party format data structure.
+
+        The data is viewed without copying wherever the target library
+        allows it (whether attaching units implies a copy is still to be
+        clarified). Supported third-party formats are:
+
+        - ``pd``: returns a :class:`pandas.Series`. Multi-dimensional data
+          is first viewed as an Awkward array and then wrapped with
+          ``awkward_pandas.from_awkward``
+        - ``np``: returns the internal `nda` attribute
+          (:class:`numpy.ndarray`), multiplied by the unit found in the
+          ``units`` attribute if units are attached
+        - ``ak``: returns an :class:`ak.Array` initialized with `self.nda`
+
+        Parameters
+        ----------
+        library
+            format of the returned data view.
+        with_units
+            forward physical units to the output data. Has no effect if the
+            object has no ``units`` attribute.
+
+        Raises
+        ------
+        ValueError
+            if `library` is not one of the supported formats, or if units
+            are requested for a view that cannot carry them (``ak``, or
+            ``pd`` with multi-dimensional data).
+
+        See Also
+        --------
+        .LGDO.view_as
+        """
+        # TODO: does attaching units imply a copy?
+        # units are only attached when requested *and* available
+        attach_units = with_units and "units" in self.attrs
+
+        if library == "pd":
+            if self.nda.ndim == 1:
+                if attach_units:
+                    return pd.Series(
+                        self.nda, dtype=f"pint[{self.attrs['units']}]", copy=False
+                    )
+                return pd.Series(self.nda, copy=False)
+
+            # multi-dimensional data goes through Awkward, which cannot
+            # carry Pint units
+            if attach_units:
+                raise ValueError(
+                    "Pint does not support Awkward yet, you must view the data with_units=False"
+                )
+            return akpd.from_awkward(self.view_as("ak"))
+
+        if library == "np":
+            if attach_units:
+                return self.nda * u(self.attrs["units"])
+            return self.nda
+
+        if library == "ak":
+            if attach_units:
+                raise ValueError(
+                    "Pint does not support Awkward yet, you must view the data with_units=False"
+                )
+            return ak.Array(self.nda)
+
+        raise ValueError(f"{library} is not a supported third-party format.")
diff --git a/src/lgdo/types/arrayofequalsizedarrays.py b/src/lgdo/types/arrayofequalsizedarrays.py
@@ -7,7 +7,9 @@
 from collections.abc import Iterator
 from typing import Any
 
+import awkward as ak
 import numpy as np
+import pandas as pd
 
 from .. import utils as utils
 from . import vectorofvectors as vov
@@ -131,3 +133,14 @@ def to_vov(self, cumulative_length: np.ndarray = None) -> vov.VectorOfVectors:
             cumulative_length=cumulative_length,
             attrs=attrs,
         )
+
+    def view_as(
+        self, library: str, with_units: bool = False
+    ) -> pd.Series | np.ndarray | ak.Array:
+        """View the array as a third-party format data structure.
+
+        The call is forwarded unchanged to the parent class implementation.
+
+        Parameters
+        ----------
+        library
+            format of the returned data view.
+        with_units
+            forward physical units to the output data.
+
+        See Also
+        --------
+        .LGDO.view_as
+        """
+        return super().view_as(library, with_units=with_units)