diff --git a/docs/reference/hist.dask.rst b/docs/reference/hist.dask.rst new file mode 100644 index 00000000..166d2df8 --- /dev/null +++ b/docs/reference/hist.dask.rst @@ -0,0 +1,29 @@ +hist.dask package +================= + +Submodules +---------- + +hist.dask.hist module +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: hist.dask.hist + :members: + :undoc-members: + :show-inheritance: + +hist.dask.namedhist module +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: hist.dask.namedhist + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: hist.dask + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/reference/hist.rst b/docs/reference/hist.rst index 7ec13691..aabdefad 100644 --- a/docs/reference/hist.rst +++ b/docs/reference/hist.rst @@ -9,6 +9,12 @@ Subpackages hist.axis +.. toctree:: + :maxdepth: 5 + + hist.dask + + Submodules ---------- diff --git a/docs/user-guide/notebooks/Histogram.ipynb b/docs/user-guide/notebooks/Histogram.ipynb index 481d5197..da7867ae 100644 --- a/docs/user-guide/notebooks/Histogram.ipynb +++ b/docs/user-guide/notebooks/Histogram.ipynb @@ -2,60 +2,62 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# Histogram" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "hist provides two types of histograms, in which Hist is the general class, NamedHist is a forced-name class. hist supports the whole workflow for a histogram's lifecycle, including some plotting tools and shortcuts which are pretty useful for HEP studies. 
Here, you can see how to serialize/deserialize (will be achieved), construct, use, and visualize histograms.\n", "\n", "![histogram's lifecycle](https://tva1.sinaimg.cn/large/007S8ZIlgy1ggrgi6xk7fj30y108qjsf.jpg)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Hist\n", "\n", "Hist is the general class in the hist package based on [boost-histogram](https://github.com/scikit-hep/boost-histogram)'s Histogram. Here is how to serialize/deserialize (will be achieved), construct, use, and visualize histograms via Hist." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Initialize Hist\n", "\n", "You need to initialized Hist first before you use it. Two ways are provided: you can just fill the axes into the Hist instance and create it; you can also add axes in Hist object via hist proxy. \n", "\n", "When initializing you don't have to use named-axes, axes without names are allowed. Using named-axes is recommended, because you will get more shortcuts to make the best of hist (there is also a classed called NamedHist which forces names be used most places). Duplicated non-empty names are not allowed in the Hist as name is the unique identifier for a Hist object." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from hist import Hist\n", "import hist" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Standard method:" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# fill the axes\n", "h = Hist(\n", @@ -66,22 +68,22 @@ " 50, -5, 5, name=\"W\", label=\"w [units]\", underflow=False, overflow=False\n", " ),\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Shortcut method:\n", "\n", "One benefit of the shortcut method is that you can work entirely from Hist, so `from hist import Hist` can be used." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# add the axes, finalize with storage\n", "h = (\n", @@ -89,29 +91,29 @@ " .Reg(50, -5, 5, name=\"W\", label=\"w [units]\", flow=False)\n", " .Double()\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Manipulate Hist" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Fill Hist\n", "\n", "After initializing the Hist, the most likely thing you want to do is to fill it. The normal method to fill the histogram is just to pass the data to `.fill()`, and the data will be filled in the index order. If you have axes all with names in your Hist, you will have another option -- filling by names in the order of names given." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import numpy as np\n", "\n", @@ -126,77 +128,77 @@ "\n", "# fill by names\n", "h.fill(W=w_data, S=s_data)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Access Bins\n", "\n", "hist allows you to access the bins of your Hist by various ways. Besides the normal access by index, you can use locations (supported by boost-histogram), complex numbers, and the dictionary to access the bins." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Access by bin number\n", "h[25, 25]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Access by data coordinate\n", "# Identical to: h[hist.loc(0), hist.loc(0)]\n", "h[0j, 0j]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Identical to: h[hist.loc(-1) + 5, hist.loc(-4) + 20]\n", "h[-1j + 5, -4j + 20]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "If you are accessing multiple bins, you can use complex numbers to rebin." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Identical to: h.project(\"S\")[20 : 30 : hist.rebin(2)]\n", "h.project(\"S\")[20:30:2j]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Dictionary is allowed when accessing bins. If you have axes all with names in your Hist, you can also access them according to the axes' names." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "s = Hist(\n", " hist.axis.Regular(50, -5, 5, name=\"Norm\", label=\"normal distribution\"),\n", @@ -205,13 +207,13 @@ " hist.axis.Boolean(name=\"Yes\"),\n", " hist.axis.Integer(0, 1000, name=\"Int\"),\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "s.fill(\n", " Norm=np.random.normal(size=1000),\n", @@ -220,79 +222,79 @@ " Yes=[True] * 600 + [False] * 400,\n", " Int=np.ones(1000),\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "s[0j, -0j + 2, \"hi\", True, 1]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "s[{0: 0j, 3: True, 4: 1, 1: -0j + 2, 2: \"hi\"}] += 10\n", "\n", "s[{\"Greet\": \"hi\", \"Unif\": -0j + 2, \"Yes\": True, \"Int\": 1, \"Norm\": 0j}]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Get Density\n", "\n", "If you want to get the density of an existing histogram, `.density()` is capable to do it and will return you the density array without overflow and underflow bins. 
(*This may return a \"smart\" object in the future; for now it's a simple NumPy array.*)" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "h[25:30, 25:30].density()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Get Project\n", "\n", "Hist allows you to get the projection of an N-D Histogram:" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "s_2d = s.project(\"Norm\", \"Unif\")\n", "s_2d" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### Get Profile\n", "\n", "To compute the (N-1)-D profile from an existing histogram, you can:" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "xy = np.array(\n", " [\n", @@ -315,22 +317,22 @@ "hp = h_xy.profile(\"y\")\n", "hp.values()[1:-1]\n", "# hp.variances()[1:-1]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Plot Hist\n", "\n", "One of the most amazing feature of hist is it's powerful plotting family. Here is a brief demonstration of how to plot Hist. You can get more information in the section of [Plots](./Plots.ipynb)." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", @@ -340,20 +342,20 @@ "h.project(\"W\").plot(ax=axs[0])\n", "h.project(\"W\", \"S\").plot(ax=axs[1])\n", "plt.show()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "This is an example of a pull plot:" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from uncertainties import unumpy as unp\n", "\n", @@ -361,40 +363,40 @@ "def pdf(x, a=1 / np.sqrt(2 * np.pi), x0=0, sigma=1, offset=0):\n", " exp = unp.exp if a.dtype == np.dtype(\"O\") else np.exp\n", " return a * exp(-((x - x0) ** 2) / (2 * sigma**2)) + offset" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "*(The uncertainty is non-significant as we filled a great quantities of observation points above.)*" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "plt.figure(figsize=(10, 6))\n", "\n", "h.project(\"S\").plot_pull(pdf)\n", "\n", "plt.show()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "You can also pass Hist objects directly to mplhep (which is what is used for the backend of Hist anyway):" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import mplhep\n", "\n", @@ -405,22 +407,22 @@ "mplhep.hist2dplot(h, ax=axs[1])\n", "\n", "plt.show()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## NamedHist\n", "\n", "If you want to force names always be used, you can use NamedHist. This reduces functionality but can reduce mistaking one axes for another." 
- ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "h = hist.NamedHist(\n", " hist.axis.Regular(\n", @@ -430,13 +432,13 @@ " 50, -5, 5, name=\"W\", label=\"w [units]\", underflow=False, overflow=False\n", " ),\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# should all use names\n", "s_data = np.random.normal(size=50_000)\n", @@ -446,13 +448,13 @@ "\n", "assert h[25, 25] == h[0j, 1j - 5] == h[{\"W\": 25, \"S\": 0j}]\n", "assert h[:, 0:50:5j].project(\"S\")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# plot2d full\n", "h.plot2d_full(\n", @@ -465,13 +467,13 @@ " side_color=\"steelblue\",\n", ")\n", "plt.show()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# plot pull\n", "h.project(\"W\").plot_pull(\n", @@ -488,16 +490,141 @@ " bar_color=\"darkgreen\",\n", ")\n", "plt.show()" - ], + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## hist.dask\n", + "\n", + "If you want to fill your histograms using delayed arrays provided by dask, start by importing the hist.dask sub-package, usually calling it `dah`.\n", + "Within this sub-package, dask versions of Hist and NamedHist are available. All methods of Hist and NamedHist instantiation discussed above are supported in their dask forms. This way of using hist is most useful when operating on large datasets and distributed clusters. \n", + "\n", + "**An important note**: as with all dask collections, the in-memory and finalized form of the histogram is only rendered when you call `.compute()` or `dask.compute()` on the dask collection! 
Until that point you are manipulating a *task graph* that represents the process of filling and creating that histogram." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import hist.dask as dah\n", + "import dask.array as da" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hist\n", + "\n", + "Below we'll use a dask array to fill a `hist.dask.Hist` lazily, as a proxy for filling it on a cluster, and then plot the resulting histogram!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add the axes, finalize with storage\n", + "h = (\n", + " dah.Hist.new.Reg(50, -5, 5, name=\"S\", label=\"s [units]\", flow=False)\n", + " .Reg(50, -5, 5, name=\"W\", label=\"w [units]\", flow=False)\n", + " .Double()\n", + ")\n", + "\n", + "s_data = da.random.standard_normal(size=(50_000,), chunks=(1000,))\n", + "w_data = da.random.standard_normal(size=(50_000,), chunks=(1000,))\n", + "\n", + "# delayed fill\n", + "h.fill(W=w_data, S=s_data)\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# auto-plot\n", + "fig, axs = plt.subplots(1, 2, figsize=(10, 4))\n", + "\n", + "h.project(\"W\").plot(ax=axs[0])\n", + "h.project(\"W\", \"S\").plot(ax=axs[1])\n", + "plt.show()\n", + "\n", + "h.visualize() # from here we can see that only the task graph is created and there is no filled histogram!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# render in-memory histogram\n", + "h = h.compute()\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# auto-plot\n", + "fig, axs = plt.subplots(1, 2, figsize=(10, 4))\n", + "\n", + "h.project(\"W\").plot(ax=axs[0])\n", + "h.project(\"W\", \"S\").plot(ax=axs[1])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NamedHist\n", + "\n", + "Below we'll use a dask array to fill a `hist.dask.NamedHist` lazily, as a proxy for filling it on a cluster, and then plot the resulting histogram!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], - "metadata": {} + "source": [ + "h = dah.NamedHist(\n", + " hist.axis.Regular(\n", + " 50, -5, 5, name=\"S\", label=\"s [units]\", underflow=False, overflow=False\n", + " ),\n", + " hist.axis.Regular(\n", + " 50, -5, 5, name=\"W\", label=\"w [units]\", underflow=False, overflow=False\n", + " ),\n", + ")\n", + "# should all use names\n", + "s_data = da.random.standard_normal(size=(50_000,), chunks=(1000,))\n", + "w_data = da.random.standard_normal(size=(50_000,), chunks=(1000,))\n", + "\n", + "h.fill(W=w_data, S=s_data)\n", + "\n", + "h = h.compute()\n", + "assert h[25, 25] == h[0j, 1j - 5] == h[{\"W\": 25, \"S\": 0j}]\n", + "assert h[:, 0:50:5j].project(\"S\")\n", + "\n", + "# plot2d full\n", + "h.plot2d_full(\n", + " main_cmap=\"cividis\",\n", + " top_ls=\"--\",\n", + " top_color=\"orange\",\n", + " top_lw=2,\n", + " side_ls=\":\",\n", + " side_lw=2,\n", + " side_color=\"steelblue\",\n", + ")\n", + "plt.show()" + ] } ], "metadata": { "kernelspec": { - "display_name": "hist", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "hist" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -509,7 +636,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - 
"version": "3.9.4" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 02251e0a..efcb590d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ classifiers = [ keywords = [ "histogram", "boost-histogram", + "dask-histogram", ] requires-python = ">=3.7" dependencies = [ @@ -62,9 +63,15 @@ plot = [ "scipy >=1.4; python_version<'3.11'", "iminuit >=2; python_version<'3.11'", ] +dask = [ + "dask[dataframe] >=2022; python_version>'3.7'", + "dask_histogram >=2023.1; python_version>'3.7'" +] test = [ "pytest >=6", "pytest-mpl >=0.12", + "dask[dataframe] >=2022; python_version>'3.7'", + "dask_histogram >=2023.1; python_version>'3.7'", ] dev = [ "pytest >=6", @@ -74,6 +81,8 @@ dev = [ "scipy >=1.4; python_version<'3.11'", "iminuit >=2; python_version<'3.11'", "ipykernel", + "dask[dataframe] >=2022; python_version>'3.7'", + "dask_histogram >=2023.1; python_version>'3.7'", ] docs = [ "pytest >=6", @@ -83,6 +92,9 @@ docs = [ "scipy >=1.4; python_version<'3.11'", "iminuit >=2; python_version<'3.11'", "ipython_genutils", + "graphviz >=0.20.1", + "dask[dataframe] >=2022; python_version>'3.7'", + "dask_histogram >=2023.1; python_version>'3.7'", "nbsphinx", "Sphinx >=3.0.0", "sphinx_copybutton", @@ -135,6 +147,7 @@ module = [ "scipy.*", "iminuit.*", "mplhep.*", + "dask_histogram.*", ] ignore_missing_imports = true diff --git a/src/hist/dask/__init__.py b/src/hist/dask/__init__.py new file mode 100644 index 00000000..5d88bf74 --- /dev/null +++ b/src/hist/dask/__init__.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +try: + import dask_histogram # noqa: F401 +except ModuleNotFoundError as err: + raise ModuleNotFoundError( + """for hist.dask, install the 'dask_histogram' package with: + pip install dask_histogram + or + conda install dask_histogram""" + ) from err + +from .hist import Hist +from .namedhist import NamedHist + +__all__ = ["Hist", "NamedHist"] diff --git a/src/hist/dask/hist.py b/src/hist/dask/hist.py new 
file mode 100644 index 00000000..f186a0fa --- /dev/null +++ b/src/hist/dask/hist.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import dask_histogram.boost as dhb + +import hist + +from ..hist import Hist as HistNoDask + + +class Hist(HistNoDask, dhb.Histogram, family=hist): # type: ignore[misc] + @property + def _in_memory_type(self) -> type[HistNoDask]: + return HistNoDask diff --git a/src/hist/dask/namedhist.py b/src/hist/dask/namedhist.py new file mode 100644 index 00000000..aec97975 --- /dev/null +++ b/src/hist/dask/namedhist.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import dask_histogram.boost as dhb + +import hist + +from ..namedhist import NamedHist as NamedHistNoDask + + +class NamedHist(NamedHistNoDask, dhb.Histogram, family=hist): # type: ignore[misc] + @property + def _in_memory_type(self) -> type[NamedHistNoDask]: + return NamedHistNoDask diff --git a/tests/conftest.py b/tests/conftest.py index fd83bad1..37a80fab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,11 @@ from hist import Hist, NamedHist from hist.hist import BaseHist +try: + import hist.dask as dah +except ImportError: + dah = None + @pytest.fixture(params=[Hist, BaseHist, NamedHist]) def named_hist(request): @@ -14,3 +19,20 @@ def named_hist(request): @pytest.fixture(params=[Hist, BaseHist]) def unnamed_hist(request): yield request.param + + +dask_params_named = [] +dask_params_unnamed = [] +if dah is not None: + dask_params_named = [dah.Hist, dah.NamedHist] + dask_params_unnamed = [dah.Hist] + + +@pytest.fixture(params=dask_params_named) +def named_dask_hist(request): + yield request.param + + +@pytest.fixture(params=dask_params_unnamed) +def unnamed_dask_hist(request): + yield request.param diff --git a/tests/test_dask.py b/tests/test_dask.py new file mode 100644 index 00000000..822bc189 --- /dev/null +++ b/tests/test_dask.py @@ -0,0 +1,123 @@ +from __future__ import annotations + +import numpy as np +import pytest + +import hist + +da = 
pytest.importorskip("dask.array") +dah = pytest.importorskip("hist.dask") + + +@pytest.mark.parametrize("use_weights", [True, False]) +def test_unnamed_5D_strcat_intcat_rectangular(unnamed_dask_hist, use_weights): + x = da.random.standard_normal(size=(2000, 3), chunks=(400, 3)) + if use_weights: + weights = da.random.uniform(0.5, 0.75, size=x.shape[0], chunks=x.chunksize[0]) + storage = hist.storage.Weight() + else: + weights = None + storage = hist.storage.Double() + + h = unnamed_dask_hist( + hist.axis.StrCategory([], growth=True, name="strcat"), + hist.axis.IntCategory([], growth=True, name="intcat"), + hist.axis.Regular(8, -3.5, 3.5, name="x"), + hist.axis.Regular(7, -3.3, 3.3, name="y"), + hist.axis.Regular(9, -3.2, 3.2, name="z"), + storage=storage, + ) + xT = x.T + h.fill(strcat="testcat1", intcat=1, x=xT[0], y=xT[1], z=xT[2], weight=weights) + h.fill(strcat="testcat2", intcat=2, x=xT[0], y=xT[1], z=xT[2], weight=weights) + h = h.compute() + + control = h.__class__(*h.axes, storage=h.storage_type()) + xTc = x.compute().T + if use_weights: + control.fill( + strcat="testcat1", + intcat=1, + x=xTc[0], + y=xTc[1], + z=xTc[2], + weight=weights.compute(), + ) + control.fill( + strcat="testcat2", + intcat=2, + x=xTc[0], + y=xTc[1], + z=xTc[2], + weight=weights.compute(), + ) + else: + control.fill(strcat="testcat1", intcat=1, x=xTc[0], y=xTc[1], z=xTc[2]) + control.fill(strcat="testcat2", intcat=2, x=xTc[0], y=xTc[1], z=xTc[2]) + + assert np.allclose(h.counts(), control.counts()) + if use_weights: + assert np.allclose(h.variances(), control.variances()) + + assert len(h.axes[0]) == 2 and len(control.axes[0]) == 2 + assert all(cx == hx for cx, hx in zip(control.axes[0], h.axes[0])) + + assert len(h.axes[1]) == 2 and len(control.axes[1]) == 2 + assert all(cx == hx for cx, hx in zip(control.axes[1], h.axes[1])) + + +@pytest.mark.parametrize("use_weights", [True, False]) +def test_named_5D_strcat_intcat_rectangular(named_dask_hist, use_weights): + x = 
da.random.standard_normal(size=(2000, 3), chunks=(400, 3)) + if use_weights: + weights = da.random.uniform(0.5, 0.75, size=x.shape[0], chunks=x.chunksize[0]) + storage = hist.storage.Weight() + else: + weights = None + storage = hist.storage.Double() + + h = named_dask_hist( + hist.axis.StrCategory([], growth=True, name="strcat"), + hist.axis.IntCategory([], growth=True, name="intcat"), + hist.axis.Regular(8, -3.5, 3.5, name="x"), + hist.axis.Regular(7, -3.3, 3.3, name="y"), + hist.axis.Regular(9, -3.2, 3.2, name="z"), + storage=storage, + ) + xT = x.T + h.fill(strcat="testcat1", intcat=1, x=xT[0], y=xT[1], z=xT[2], weight=weights) + h.fill(strcat="testcat2", intcat=2, x=xT[0], y=xT[1], z=xT[2], weight=weights) + h = h.compute() + + control = h.__class__(*h.axes, storage=h.storage_type()) + xTc = x.compute().T + if use_weights: + control.fill( + strcat="testcat1", + intcat=1, + x=xTc[0], + y=xTc[1], + z=xTc[2], + weight=weights.compute(), + ) + control.fill( + strcat="testcat2", + intcat=2, + x=xTc[0], + y=xTc[1], + z=xTc[2], + weight=weights.compute(), + ) + else: + control.fill(strcat="testcat1", intcat=1, x=xTc[0], y=xTc[1], z=xTc[2]) + control.fill(strcat="testcat2", intcat=2, x=xTc[0], y=xTc[1], z=xTc[2]) + + assert np.allclose(h.counts(), control.counts()) + if use_weights: + assert np.allclose(h.variances(), control.variances()) + + assert len(h.axes[0]) == 2 and len(control.axes[0]) == 2 + assert all(cx == hx for cx, hx in zip(control.axes[0], h.axes[0])) + + assert len(h.axes[1]) == 2 and len(control.axes[1]) == 2 + assert all(cx == hx for cx, hx in zip(control.axes[1], h.axes[1]))