Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alternative variable binning approach #849

Merged
merged 20 commits into from
Mar 13, 2025
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
fcae83c
Add variable binning and adjust output functions
JanWeldert Jan 3, 2025
8c20429
Force events representation and add histogramming to translations
JanWeldert Jan 7, 2025
35dcf0e
Adjust analysis.py and config parser
JanWeldert Jan 9, 2025
81c308b
Add example
JanWeldert Jan 10, 2025
eaf6235
Expand example a bit
JanWeldert Jan 17, 2025
3215d55
fit_hypo does not know hypo_asimov_dist
JanWeldert Feb 14, 2025
87f21fb
generalise pipeline config parser and varbinning to accept selection …
thehrh Feb 26, 2025
854e356
Make selection string work with container
JanWeldert Feb 27, 2025
eb78c69
Check if selections are exclusive
JanWeldert Feb 28, 2025
0ee0429
Define selection before checking it
JanWeldert Feb 28, 2025
ba8c4a7
Simple test and more checks for VarBinning
JanWeldert Mar 3, 2025
e9b0b5b
docs & comments, split up check_varbinning (was performing two very d…
thehrh Mar 6, 2025
9f078c6
exclusivity check only when requesting new output binning in get_outputs
thehrh Mar 6, 2025
92409cf
Add docstrings and change assert to warn
JanWeldert Mar 6, 2025
94a0529
fix undefined varbinning name in config parser, debug logging of sele…
thehrh Mar 6, 2025
7837426
superficial: comments & docstrings, thousands of lines of ipynb outpu…
thehrh Mar 6, 2025
f7a53ad
also make separate functions for parsing different types of binning f…
thehrh Mar 6, 2025
b0eabba
Merge remote-tracking branch 'origin/master' into var_bin
thehrh Mar 12, 2025
1a3dd76
adapt varbinning init and tests (require > 1 binnings, detect any bin…
thehrh Mar 12, 2025
5c2d85f
comments/NotImplementedErrors for generalized_poisson_llh in analysis…
thehrh Mar 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 53 additions & 32 deletions pisa/analysis/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,16 +647,22 @@ def __init__(
assert hypo_maker is not None, msg
assert data_dist is not None, msg
assert metric is not None, msg
# this passes through the setter method, but it should just pass through
# without actually doing anything
if hypo_maker.__class__.__name__ == "Detectors":
# this passes through the setter method, but it should just pass through
# without actually doing anything
self.detailed_metric_info = [self.get_detailed_metric_info(
data_dist=data_dist[i], hypo_asimov_dist=self.hypo_asimov_dist[i],
params=hypo_maker.distribution_makers[i].params, metric=metric[i],
other_metrics=other_metrics, detector_name=hypo_maker.det_names[i], hypo_maker=hypo_maker
other_metrics=other_metrics, detector_name=hypo_maker.det_names[i], hypo_maker=hypo_maker,
) for i in range(len(data_dist))]
else: # DistributionMaker object
if 'generalized_poisson_llh' == metric[0]:
elif isinstance(data_dist, list): # DistributionMaker object with variable binning
self.detailed_metric_info = [self.get_detailed_metric_info(
data_dist=data_dist[i], hypo_asimov_dist=self.hypo_asimov_dist[i],
params=hypo_maker.params, metric=metric[0],
other_metrics=other_metrics, detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker,
) for i in range(len(data_dist))]
else: # DistributionMaker object with regular binning
if metric[0] == 'generalized_poisson_llh':
generalized_poisson_dist = hypo_maker.get_outputs(return_sum=False, force_standard_output=False)
generalized_poisson_dist = merge_mapsets_together(mapset_list=generalized_poisson_dist)
else:
Expand All @@ -665,7 +671,7 @@ def __init__(
self.detailed_metric_info = self.get_detailed_metric_info(
data_dist=data_dist, hypo_asimov_dist=self.hypo_asimov_dist, generalized_poisson_hypo=generalized_poisson_dist,
params=hypo_maker.params, metric=metric[0], other_metrics=other_metrics,
detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker
detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker,
)

def __getitem__(self, i):
Expand Down Expand Up @@ -1180,17 +1186,6 @@ def fit_recursively(

"""

if isinstance(metric, str):
metric = [metric]

# Before starting any fit, check if we already have a perfect match between data and template
# This can happen if using pseudodata that was generated with the nominal values for parameters
# (which will also be the initial values in the fit) and blah...
# If this is the case, don't both to fit and return results right away.

if isinstance(metric, str):
metric = [metric]

# Grab the hypo map
hypo_asimov_dist = hypo_maker.get_outputs(return_sum=True)

Expand All @@ -1201,12 +1196,18 @@ def fit_recursively(
if len(metric) == 1: # One metric for all detectors
metric = list(metric) * len(hypo_maker.distribution_makers)
elif len(metric) != len(hypo_maker.distribution_makers):
raise IndexError('Number of defined metrics does not match with number of detectors.')
else: # DistributionMaker object
assert len(metric) == 1
raise IndexError(f"Number of defined metrics does not match with number of detectors.")
elif isinstance(hypo_asimov_dist, MapSet) or isinstance(hypo_asimov_dist, list):
# DistributionMaker object (list means variable binning)
assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"
else:
raise NotImplementedError(f"hypo_maker returned output of type {type(hypo_asimov_dist)}")

# Check if the hypo matches data
if hypo_maker.__class__.__name__ != "Detectors" and data_dist.allclose(hypo_asimov_dist) :
# Before starting any fit, check if we already have a perfect match between data and template
# This can happen if using pseudodata that was generated with the nominal values for parameters
# (which will also be the initial values in the fit)
    # If this is the case, don't bother to fit and return results right away.
if isinstance(data_dist, MapSet) and data_dist.allclose(hypo_asimov_dist):

msg = 'Initial hypo matches data, no need for fit'
logging.info(msg)
Expand Down Expand Up @@ -2785,11 +2786,17 @@ def _minimizer_callable(self, scaled_param_vals, hypo_maker, data_dist,
metric_val = 0
for i in range(len(hypo_maker.distribution_makers)):
data = data_dist[i].metric_total(expected_values=hypo_asimov_dist[i],
metric=metric[i], metric_kwargs=metric_kwargs)
metric=metric[i], metric_kwargs=metric_kwargs)
metric_val += data
priors = hypo_maker.params.priors_penalty(metric=metric[0]) # uses just the "first" metric for prior
metric_val += priors
else: # DistributionMaker object
elif isinstance(hypo_asimov_dist, list): # DistributionMaker object with variable binning
metric_val = 0
for i in range(len(hypo_asimov_dist)):
metric_val += data_dist[i].metric_total(expected_values=hypo_asimov_dist[i],
metric=metric[0], metric_kwargs=metric_kwargs)
metric_val += hypo_maker.params.priors_penalty(metric=metric[0])
else: # DistributionMaker object with regular binning
if metric[0] == 'weighted_chi2':
actual_values = data_dist.hist['total']
expected_values = hypo_asimov_dist.hist['total']
Expand Down Expand Up @@ -3121,9 +3128,9 @@ def fit_hypo(self, data_dist, hypo_maker, metric, minimizer_settings,
if len(metric) == 1: # One metric for all detectors
metric = list(metric) * len(hypo_maker.distribution_makers)
elif len(metric) != len(hypo_maker.distribution_makers):
raise IndexError('Number of defined metrics does not match with number of detectors.')
else: # DistributionMaker object
assert len(metric) == 1
raise IndexError(f"Number of defined metrics does not match with number of detectors.")
else:
assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"

if check_ordering:
if 'nh' in hypo_param_selections or 'ih' in hypo_param_selections:
Expand Down Expand Up @@ -3209,9 +3216,12 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
if len(metric) == 1: # One metric for all detectors
metric = list(metric) * len(hypo_maker.distribution_makers)
elif len(metric) != len(hypo_maker.distribution_makers):
raise IndexError('Number of defined metrics does not match with number of detectors.')
else: # DistributionMaker object
assert len(metric) == 1
raise IndexError(f"Number of defined metrics does not match with number of detectors.")
elif isinstance(hypo_asimov_dist, MapSet) or isinstance(hypo_asimov_dist, list):
# DistributionMaker object (list means variable binning)
assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"
else:
raise NotImplementedError(f"hypo_maker returned output of type {type(hypo_asimov_dist)}")
fit_info.metric = metric

# Assess the fit: whether the data came from the hypo_asimov_dist
Expand All @@ -3223,7 +3233,12 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
metric_val += data
priors = hypo_maker.params.priors_penalty(metric=metric[0]) # uses just the "first" metric for prior
metric_val += priors
else: # DistributionMaker object
elif isinstance(data_dist, list): # DistributionMaker object with VarBinning
metric_val = 0
for i in range(len(data_dist)):
metric_val += data_dist[i].metric_total(expected_values=hypo_asimov_dist[i], metric=metric[0])
metric_val += hypo_maker.params.priors_penalty(metric=metric[0])
else: # DistributionMaker object with MultiDimBinning

if 'generalized_poisson_llh' == metric[0]:

Expand Down Expand Up @@ -3271,7 +3286,13 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
params=hypo_maker.distribution_makers[i].params, metric=metric[i],
other_metrics=other_metrics, detector_name=hypo_maker.det_names[i]
) for i in range(len(data_dist))]
else: # DistributionMaker object
elif isinstance(data_dist, list): # DistributionMaker object with VarBinning
fit_info.detailed_metric_info = [fit_info.get_detailed_metric_info(
data_dist=data_dist[i], hypo_asimov_dist=hypo_asimov_dist[i],
params=hypo_maker.params, metric=metric[0], other_metrics=other_metrics,
detector_name=hypo_maker.detector_name
) for i in range(len(data_dist))]
else: # DistributionMaker object with MultiDimBinning

if 'generalized_poisson_llh' == metric[0]:
generalized_poisson_dist = hypo_maker.get_outputs(return_sum=False, force_standard_output=False)
Expand Down
182 changes: 175 additions & 7 deletions pisa/core/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ class (MultiDimBinning) for arbitrarily many of dimensions (one or more). These

__all__ = ['NAME_FIXES', 'NAME_SEPCHARS', 'NAME_FIXES_REGEXES',
'basename', '_new_obj', 'is_binning',
'OneDimBinning', 'MultiDimBinning',
'test_OneDimBinning', 'test_MultiDimBinning']
'OneDimBinning', 'MultiDimBinning', 'VarBinning',
'test_OneDimBinning', 'test_MultiDimBinning', 'test_VarBinning']

__author__ = 'J.L. Lanfranchi'

Expand Down Expand Up @@ -139,7 +139,7 @@ def new_function(cls, *args, **kwargs):
return new_function


class OneDimBinning(object):
class OneDimBinning():
# pylint: disable=line-too-long
"""Histogram-oriented binning specialized to a single dimension.

Expand Down Expand Up @@ -1481,7 +1481,7 @@ def __ne__(self, other):
return not self.__eq__(other)


class MultiDimBinning(object):
class MultiDimBinning():
# pylint: disable=line-too-long
r"""
Multi-dimensional binning object. This can contain one or more
Expand Down Expand Up @@ -1590,7 +1590,6 @@ def __init__(self, dimensions, name=None, mask=None):
# Handle masking
self._init_mask(mask)


def _init_mask(self, mask) :
'''
Initialize the bin mask. This can either be specified as:
Expand Down Expand Up @@ -1639,13 +1638,11 @@ def _init_mask(self, mask) :
# Done, store the mask
self._mask = mask


@property
def name(self):
"""Name of the dimension"""
return self._name


def __repr__(self):
previous_precision = np.get_printoptions()['precision']
np.set_printoptions(precision=18)
Expand Down Expand Up @@ -3043,6 +3040,143 @@ def __ne__(self, other):
return not self.__eq__(other)


class VarBinning():
    # pylint: disable=line-too-long
    r"""Binning that pairs multiple analysis binnings with a set of
    mutually exclusive event selections (exclusivity is verified at
    analysis time), so that each sub-sample can be histogrammed with
    its own `MultiDimBinning`.

    A typical use case is a different 2d binning in reconstructed energy
    and cosine zenith for each PID bin.

    Parameters
    ----------
    binnings : list of MultiDimBinnings
        One MultiDimBinning per selection.

    selections : OneDimBinning or list of strings (cut expressions)
        Selections defining the sub-samples. Either a OneDimBinning of
        the selection variable, or an arbitrary selection given as a list
        of strings, each holding the respective cut expression (similar
        to data.simple_data_loader's `mc_cuts`).

    Notes
    -----
    `binnings` and `selections` must have equal length > 1 (with a single
    selection you could just apply MC cuts the regular way); no array
    broadcasting is attempted.

    If `selections` is a `OneDimBinning`, its dimension must not appear
    in any of the `binnings`. Likewise, if cut expressions are given,
    none of the variables they reference may simultaneously be a
    dimension of any `MultiDimBinning`. Such setups needn't always be
    dangerous, but there are probably less convoluted ways of achieving
    them.

    A warning is issued when `selections` is a `OneDimBinning` while all
    entries of `binnings` are identical.

    """
    # pylint: enable=line-too-long
    def __init__(self, binnings, selections):

        if not isinstance(selections, (OneDimBinning, list)):
            raise ValueError(f"Selection type {type(selections)} not supported!")
        # basic sanity checks on the inputs
        assert isinstance(binnings, list) and len(binnings) == len(selections)
        assert len(binnings) > 1

        for binning in binnings:
            assert isinstance(binning, MultiDimBinning)
            # reject any selection variable that doubles as a binning dim
            clashing_vars = self._selection_vars_in_mdb(binning, selections)
            if clashing_vars:
                raise ValueError(
                    f"Selection variables {clashing_vars} may not"
                    " simultaneously be part of any MultiDimBinning!"
                )

        if isinstance(selections, OneDimBinning) and all(
            binning == binnings[0] for binning in binnings
        ):
            logging.warning(
                f"All binnings are equal, namely {binnings[0]!s}. It will most"
                " likely be advantageous to simply use the MultiDimBinning"
                " class for the same analysis purpose."
            )

        self._binnings = binnings
        self._selections = selections
        self._nselections = len(selections)
        self._names = None  # lazily filled by the `names` property

    def _selection_vars_in_mdb(self, binning, selections):
        """Return the binning dimension names that also appear in the
        selections (empty list if there is no overlap)."""
        if isinstance(selections, OneDimBinning):
            return [selections.name] if selections.name in binning.names else []
        if isinstance(selections, Sequence):
            # crude tokenisation of the cut expressions: collects variable
            # names along with cut values and anything else word-like
            tokens = []
            for expr in selections:
                tokens.extend(re.findall(r"\b\w+\b", expr))
            # Report every dimension name occurring verbatim in any cut
            # expression. Adapt the above if there is a false positive.
            return [name for name in binning.names if name in tokens]
        return []

    @property
    def binnings(self):
        """list of MultiDimBinning : each binning in a list"""
        return self._binnings

    @property
    def selections(self):
        """list of strs or OneDimBinning : selections for which to use different binnings"""
        return self._selections

    @property
    def nselections(self):
        """int : number of selections with possibly different binnings"""
        return self._nselections

    @property
    def names(self):
        """list of strings : names of each (binning) dimension contained"""
        if self._names is None:
            self._names = [binning.names for binning in self.binnings]
        return self._names

    def __pretty__(self, p, cycle):
        """Method used by the `pretty` library for formatting"""
        cls_name = self.__class__.__name__
        if cycle:
            p.text(f'{cls_name}(...)')
            return
        p.begin_group(4, f'{cls_name}([')
        last = len(self) - 1
        for idx, binning in enumerate(self):
            p.breakable()
            p.pretty(binning)
            if idx != last:
                p.text(',')
        p.end_group(4, '])')

    def _repr_pretty_(self, p, cycle):
        """Method used by e.g. ipython/Jupyter for formatting"""
        return self.__pretty__(p, cycle)

    def __iter__(self):
        """Iterate over dimensions. Use `iterbins` to iterate over bins."""
        return iter(self._binnings)

    def __len__(self):
        return self._nselections


def test_OneDimBinning():
"""Unit tests for OneDimBinning class"""
# pylint: disable=line-too-long, wrong-import-position
Expand Down Expand Up @@ -3499,7 +3633,41 @@ def test_MultiDimBinning():
logging.info('<< PASS : test_MultiDimBinning >>')


def test_VarBinning():
    """Unit tests for VarBinning class.

    Covers: construction with a OneDimBinning selection, construction
    with cut-expression selections, basic properties (`names`,
    `nselections`, `__len__`, iteration), and rejection of selections
    that cut on a binning dimension.
    """
    # pylint: disable=wrong-import-position

    b1 = OneDimBinning(name='energy', num_bins=40, is_log=True,
                       domain=[1, 80]*ureg.GeV)
    b2 = OneDimBinning(name='coszen', num_bins=40, is_lin=True,
                       domain=[-1, 1])
    mdb1 = MultiDimBinning([b1, b2])
    b3 = OneDimBinning(name='energy', num_bins=20, is_log=True,
                       domain=[1, 80]*ureg.GeV)
    mdb2 = MultiDimBinning([b3, b2])

    # selection via a OneDimBinning whose dimension is not a binning dim
    selection = OneDimBinning(name='pid', is_lin=True, num_bins=2, domain=[0, 1])
    varbin = VarBinning([mdb1, mdb2], selection)
    assert varbin.names[0] == varbin.names[1] == ['energy', 'coszen']
    assert varbin.nselections == 2
    assert len(varbin) == 2
    # iteration yields the contained MultiDimBinnings
    assert all(isinstance(mdb, MultiDimBinning) for mdb in varbin)

    # selection via a list of cut expressions is also allowed
    selection = ['pid < 0.5', 'pid >= 0.5']
    VarBinning([mdb1, mdb2], selection)

    # cutting on a variable that is also a binning dimension must fail
    selection = ['energy == 10', 'coszen < 0']
    try:
        VarBinning([mdb1, mdb1], selection)
    except ValueError:
        pass
    else:
        assert False, 'expected ValueError for selection on binning dimensions'

    logging.info('<< PASS : test_VarBinning >>')


if __name__ == "__main__":
    # Run all binning unit tests when this module is executed as a script
    set_verbosity(1)
    test_OneDimBinning()
    test_MultiDimBinning()
    test_VarBinning()
Loading