Add a Null backend #112

Merged · 4 commits · Jun 16, 2024
2 changes: 2 additions & 0 deletions mcbackend/__init__.py
@@ -2,6 +2,7 @@
A framework agnostic implementation for storage of MCMC draws.
"""

from .backends.null import NullBackend
from .backends.numpy import NumPyBackend
from .core import Backend, Chain, Run
from .meta import ChainMeta, Coordinate, DataVariable, ExtendedValue, RunMeta, Variable
@@ -16,6 +17,7 @@
__version__ = "0.5.2"
__all__ = [
"NumPyBackend",
"NullBackend",
"Backend",
"Chain",
"Run",
117 changes: 117 additions & 0 deletions mcbackend/backends/null.py
@@ -0,0 +1,117 @@
"""
This backend simply discards draws. They are not stored in memory.
This can be used in situations where we want to run an MCMC but not permanently
store its output.
"""

# Code-wise, a NullChain is essentially just a NumPyChain without the underlying data array.

from typing import Dict, List, Mapping, Optional, Sequence, Tuple

import numpy

from ..core import Backend, Chain, Run
from ..meta import ChainMeta, RunMeta
from .numpy import grow_append, prepare_storage


class NullChain(Chain):
"""A null storage: discards values immediately and allocates no memory.

Use cases are

- Online computations: Draws are used and discarded immediately, allowing for much larger sample spaces.
- Profiling: Use as a baseline that measures compute time & memory without allocating memory for draws.
Comparing with another backend then shows how much overhead that backend adds.

Since draws are not stored, only a subset of the `Chain` interface is supported:

- Supported: `__len__`, `append`, `get_stats`, `get_stats_at`
- Not supported: `get_draws`, `get_draws_at`

.. Todo:: Option to also discard sampling stats?
.. Todo:: Allow retrieving the most recent draw?

"""

def __init__(self, cmeta: ChainMeta, rmeta: RunMeta, *, preallocate: int) -> None:
"""Creates a null storage for draws from a chain: will gobble outputs without storing them

Parameters
----------
cmeta : ChainMeta
Metadata of the chain.
rmeta : RunMeta
Metadata of the MCMC run.
preallocate : int
Influences the memory pre-allocation behavior.
(Draws are not saved, but stats may still be.)
The default is to reserve memory for ``preallocate`` draws
and grow the allocated memory by 10 % when needed.
Exceptions are variables with non-rigid shapes (indicated by 0 in the shape tuple)
where the correct amount of memory cannot be pre-allocated.
In these cases object arrays are used.
"""
self._draw_idx = 0

# Create storage ndarrays only for sampler stats.
self._stats, self._stat_is_rigid = prepare_storage(rmeta.sample_stats, preallocate)

super().__init__(cmeta, rmeta)

def append( # pylint: disable=duplicate-code
self, draw: Mapping[str, numpy.ndarray], stats: Optional[Mapping[str, numpy.ndarray]] = None
):
if stats:
grow_append(self._stats, stats, self._stat_is_rigid, self._draw_idx)
self._draw_idx += 1
return

def __len__(self) -> int:
return self._draw_idx

def get_draws(self, var_name: str, slc: slice = slice(None)) -> numpy.ndarray:
raise RuntimeError("NullChain does not save draws.")

def get_draws_at(self, idx: int, var_names: Sequence[str]) -> Dict[str, numpy.ndarray]:
raise RuntimeError("NullChain does not save draws.")

def get_stats( # pylint: disable=duplicate-code
self, stat_name: str, slc: slice = slice(None)
) -> numpy.ndarray:
data = self._stats[stat_name][: self._draw_idx][slc]
if self.sample_stats[stat_name].dtype == "str":
return numpy.array(data.tolist(), dtype=str)
return data

def get_stats_at(self, idx: int, stat_names: Sequence[str]) -> Dict[str, numpy.ndarray]:
return {sn: numpy.asarray(self._stats[sn][idx]) for sn in stat_names}


class NullRun(Run):
"""An MCMC run where samples are immediately discarded."""

def __init__(self, meta: RunMeta, *, preallocate: int) -> None:
self._settings = {"preallocate": preallocate}
self._chains: List[NullChain] = []
super().__init__(meta)

def init_chain(self, chain_number: int) -> NullChain:
cmeta = ChainMeta(self.meta.rid, chain_number)
chain = NullChain(cmeta, self.meta, **self._settings)
self._chains.append(chain)
return chain

def get_chains(self) -> Tuple[NullChain, ...]:
return tuple(self._chains)


class NullBackend(Backend):
"""A backend which discards samples immediately."""

def __init__(self, preallocate: int = 1_000) -> None:
self._settings = {"preallocate": preallocate}
super().__init__()

def init_run(self, meta: RunMeta) -> NullRun:
return NullRun(meta, **self._settings)
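
For reviewers, a minimal usage sketch of the new backend. The `RunMeta`/`Variable` keyword arguments below are assumptions about the metadata constructors and are not taken from this diff; the chain calls follow the interface described in the `NullChain` docstring.

```python
import numpy

from mcbackend import NullBackend
from mcbackend.meta import RunMeta, Variable

# Hypothetical run metadata: one model variable and one sampler stat.
meta = RunMeta(
    rid="demo-run",
    variables=[Variable(name="x", dtype="float64", shape=[2])],
    sample_stats=[Variable(name="accepted", dtype="bool", shape=[])],
)

backend = NullBackend(preallocate=100)
chain = backend.init_run(meta).init_chain(0)

for _ in range(5):
    chain.append(
        {"x": numpy.random.normal(size=2)},
        stats={"accepted": numpy.array(True)},
    )

print(len(chain))                   # 5 -- the draw counter still advances
print(chain.get_stats("accepted"))  # sampler stats are kept

try:
    chain.get_draws("x")            # draws were discarded
except RuntimeError as exc:
    print(exc)
```

The final call is expected to raise, matching the unsupported part of the `Chain` interface listed in the docstring.
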
38 changes: 20 additions & 18 deletions mcbackend/backends/numpy.py
@@ -3,12 +3,12 @@
"""

import math
from typing import Dict, List, Mapping, Optional, Sequence, Tuple
from typing import Dict, Iterable, List, Mapping, Optional, Sequence, Tuple

import numpy

from ..core import Backend, Chain, Run, is_rigid
from ..meta import ChainMeta, RunMeta
from ..meta import ChainMeta, RunMeta, Variable


def grow_append(
@@ -34,6 +34,22 @@ def grow_append(
return


def prepare_storage(
variables: Iterable[Variable], preallocate: int
) -> Tuple[Dict[str, numpy.ndarray], Dict[str, bool]]:
storage: Dict[str, numpy.ndarray] = {}
rigid_dict: Dict[str, bool] = {}
for var in variables:
rigid = is_rigid(var.shape) and not var.undefined_ndim and var.dtype != "str"
rigid_dict[var.name] = rigid
if rigid:
reserve = (preallocate, *var.shape)
storage[var.name] = numpy.empty(reserve, var.dtype)
else:
storage[var.name] = numpy.array([None] * preallocate, dtype=object)
return storage, rigid_dict


class NumPyChain(Chain):
"""Stores value draws in NumPy arrays and can pre-allocate memory."""

@@ -54,25 +70,11 @@ def __init__(self, cmeta: ChainMeta, rmeta: RunMeta, *, preallocate: int) -> None
where the correct amount of memory cannot be pre-allocated.
In these cases object arrays are used.
"""
self._var_is_rigid: Dict[str, bool] = {}
self._samples: Dict[str, numpy.ndarray] = {}
self._stat_is_rigid: Dict[str, bool] = {}
self._stats: Dict[str, numpy.ndarray] = {}
self._draw_idx = 0

# Create storage ndarrays for each model variable and sampler stat.
for target_dict, rigid_dict, variables in [
(self._samples, self._var_is_rigid, rmeta.variables),
(self._stats, self._stat_is_rigid, rmeta.sample_stats),
]:
for var in variables:
rigid = is_rigid(var.shape) and not var.undefined_ndim and var.dtype != "str"
rigid_dict[var.name] = rigid
if rigid:
reserve = (preallocate, *var.shape)
target_dict[var.name] = numpy.empty(reserve, var.dtype)
else:
target_dict[var.name] = numpy.array([None] * preallocate, dtype=object)
self._samples, self._var_is_rigid = prepare_storage(rmeta.variables, preallocate)
self._stats, self._stat_is_rigid = prepare_storage(rmeta.sample_stats, preallocate)

super().__init__(cmeta, rmeta)

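
The numpy.py change extracts the per-variable allocation loop into the new `prepare_storage` helper so that `NullChain` can reuse it for sampler stats. A small sketch of the expected behavior; the `Variable` keyword arguments and names here are illustrative assumptions:

```python
from mcbackend.backends.numpy import prepare_storage
from mcbackend.meta import Variable

# Hypothetical variables: one with a fixed shape, one with a non-rigid dimension.
fixed = Variable(name="mu", dtype="float64", shape=[3])
ragged = Variable(name="idx", dtype="int64", shape=[0])  # 0 marks a dimension of unknown length

storage, rigid = prepare_storage([fixed, ragged], preallocate=100)

print(storage["mu"].shape)   # (100, 3) -- memory reserved up front
print(storage["idx"].dtype)  # object   -- entries are filled in per draw instead
print(rigid)                 # {'mu': True, 'idx': False}
```

The refactored `NumPyChain.__init__` then reduces to two `prepare_storage` calls, one for draws and one for stats, which is exactly what the diff above shows.
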