Alternative variable binning approach #849

Merged (20 commits, merged Mar 13, 2025)
Changes shown below are from 6 commits.

Commits
fcae83c  Add variable binning and adjust output functions (JanWeldert, Jan 3, 2025)
8c20429  Force events representation and add histogramming to translations (JanWeldert, Jan 7, 2025)
35dcf0e  Adjust analysis.py and config parser (JanWeldert, Jan 9, 2025)
81c308b  Add example (JanWeldert, Jan 10, 2025)
eaf6235  Expand example a bit (JanWeldert, Jan 17, 2025)
3215d55  fit_hypo does not know hypo_asimov_dist (JanWeldert, Feb 14, 2025)
87f21fb  generalise pipeline config parser and varbinning to accept selection … (thehrh, Feb 26, 2025)
854e356  Make selection string work with container (JanWeldert, Feb 27, 2025)
eb78c69  Check if selections are exclusive (JanWeldert, Feb 28, 2025)
0ee0429  Define selection before checking it (JanWeldert, Feb 28, 2025)
ba8c4a7  Simple test and more checks for VarBinning (JanWeldert, Mar 3, 2025)
e9b0b5b  docs & comments, split up check_varbinning (was performing two very d… (thehrh, Mar 6, 2025)
9f078c6  exclusivity check only when requesting new output binning in get_outputs (thehrh, Mar 6, 2025)
92409cf  Add docstrings and change assert to warn (JanWeldert, Mar 6, 2025)
94a0529  fix undefined varbinning name in config parser, debug logging of sele… (thehrh, Mar 6, 2025)
7837426  superficial: comments & docstrings, thousands of lines of ipynb outpu… (thehrh, Mar 6, 2025)
f7a53ad  also make separate functions for parsing different types of binning f… (thehrh, Mar 6, 2025)
b0eabba  Merge remote-tracking branch 'origin/master' into var_bin (thehrh, Mar 12, 2025)
1a3dd76  adapt varbinning init and tests (require > 1 binnings, detect any bin… (thehrh, Mar 12, 2025)
5c2d85f  comments/NotImplementedErrors for generalized_poisson_llh in analysis… (thehrh, Mar 13, 2025)
74 changes: 44 additions & 30 deletions pisa/analysis/analysis.py
@@ -486,16 +486,22 @@ def __init__(
         assert hypo_maker is not None, msg
         assert data_dist is not None, msg
         assert metric is not None, msg
-        # this passes through the setter method, but it should just pass through
-        # without actually doing anything
         if hypo_maker.__class__.__name__ == "Detectors":
+            # this passes through the setter method, but it should just pass through
+            # without actually doing anything
             self.detailed_metric_info = [self.get_detailed_metric_info(
                 data_dist=data_dist[i], hypo_asimov_dist=self.hypo_asimov_dist[i],
                 params=hypo_maker.distribution_makers[i].params, metric=metric[i],
-                other_metrics=other_metrics, detector_name=hypo_maker.det_names[i], hypo_maker=hypo_maker
+                other_metrics=other_metrics, detector_name=hypo_maker.det_names[i], hypo_maker=hypo_maker,
             ) for i in range(len(data_dist))]
-        else: # DistributionMaker object
-            if 'generalized_poisson_llh' == metric[0]:
+        elif isinstance(data_dist, list): # DistributionMaker object with variable binning
+            self.detailed_metric_info = [self.get_detailed_metric_info(
+                data_dist=data_dist[i], hypo_asimov_dist=self.hypo_asimov_dist[i],
+                params=hypo_maker.params, metric=metric[0],
+                other_metrics=other_metrics, detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker,
+            ) for i in range(len(data_dist))]
+        else: # DistributionMaker object with regular binning
+            if metric[0] == 'generalized_poisson_llh':
                 generalized_poisson_dist = hypo_maker.get_outputs(return_sum=False, force_standard_output=False)
                 generalized_poisson_dist = merge_mapsets_together(mapset_list=generalized_poisson_dist)
             else:
@@ -504,7 +510,7 @@ def __init__(
             self.detailed_metric_info = self.get_detailed_metric_info(
                 data_dist=data_dist, hypo_asimov_dist=self.hypo_asimov_dist, generalized_poisson_hypo=generalized_poisson_dist,
                 params=hypo_maker.params, metric=metric[0], other_metrics=other_metrics,
-                detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker
+                detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker,
             )

     def __getitem__(self, i):
@@ -1006,17 +1012,6 @@ def fit_recursively(

         """

-        if isinstance(metric, str):
-            metric = [metric]
-
-        # Before starting any fit, check if we already have a perfect match between data and template
-        # This can happen if using pseudodata that was generated with the nominal values for parameters
-        # (which will also be the initial values in the fit) and blah...
-        # If this is the case, don't both to fit and return results right away.
-
-        if isinstance(metric, str):
-            metric = [metric]
-
         # Grab the hypo map
         hypo_asimov_dist = hypo_maker.get_outputs(return_sum=True)

@@ -1027,12 +1022,18 @@ def fit_recursively(
             if len(metric) == 1: # One metric for all detectors
                 metric = list(metric) * len(hypo_maker.distribution_makers)
             elif len(metric) != len(hypo_maker.distribution_makers):
-                raise IndexError('Number of defined metrics does not match with number of detectors.')
-        else: # DistributionMaker object
-            assert len(metric) == 1
+                raise IndexError(f"Number of defined metrics does not match with number of detectors.")
+        elif isinstance(hypo_asimov_dist, MapSet) or isinstance(hypo_asimov_dist, list):
+            # DistributionMaker object (list means variable binning)
+            assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"
+        else:
+            raise NotImplementedError(f"hypo_maker returned output of type {type(hypo_asimov_dist)}")

-        # Check if the hypo matches data
-        if hypo_maker.__class__.__name__ != "Detectors" and data_dist.allclose(hypo_asimov_dist) :
+        # Before starting any fit, check if we already have a perfect match between data and template
+        # This can happen if using pseudodata that was generated with the nominal values for parameters
+        # (which will also be the initial values in the fit)
+        # If this is the case, don't bother to fit and return results right away.
+        if isinstance(data_dist, MapSet) and data_dist.allclose(hypo_asimov_dist):

             msg = 'Initial hypo matches data, no need for fit'
             logging.info(msg)
@@ -2602,11 +2603,17 @@ def _minimizer_callable(self, scaled_param_vals, hypo_maker, data_dist,
             metric_val = 0
             for i in range(len(hypo_maker.distribution_makers)):
                 data = data_dist[i].metric_total(expected_values=hypo_asimov_dist[i],
-                                    metric=metric[i], metric_kwargs=metric_kwargs)
+                                                 metric=metric[i], metric_kwargs=metric_kwargs)
                 metric_val += data
             priors = hypo_maker.params.priors_penalty(metric=metric[0]) # uses just the "first" metric for prior
             metric_val += priors
-        else: # DistributionMaker object
+        elif isinstance(hypo_asimov_dist, list): # DistributionMaker object with variable binning
+            metric_val = 0
+            for i in range(len(hypo_asimov_dist)):
+                metric_val += data_dist[i].metric_total(expected_values=hypo_asimov_dist[i],
+                                                        metric=metric[0], metric_kwargs=metric_kwargs)
+            metric_val += hypo_maker.params.priors_penalty(metric=metric[0])
+        else: # DistributionMaker object with regular binning
             if metric[0] == 'weighted_chi2':
                 actual_values = data_dist.hist['total']
                 expected_values = hypo_asimov_dist.hist['total']
@@ -2938,9 +2945,9 @@ def fit_hypo(self, data_dist, hypo_maker, metric, minimizer_settings,
             if len(metric) == 1: # One metric for all detectors
                 metric = list(metric) * len(hypo_maker.distribution_makers)
             elif len(metric) != len(hypo_maker.distribution_makers):
-                raise IndexError('Number of defined metrics does not match with number of detectors.')
-        else: # DistributionMaker object
-            assert len(metric) == 1
+                raise IndexError(f"Number of defined metrics does not match with number of detectors.")
+        else:
+            assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"

         if check_ordering:
             if 'nh' in hypo_param_selections or 'ih' in hypo_param_selections:
@@ -3026,9 +3033,12 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
             if len(metric) == 1: # One metric for all detectors
                 metric = list(metric) * len(hypo_maker.distribution_makers)
             elif len(metric) != len(hypo_maker.distribution_makers):
-                raise IndexError('Number of defined metrics does not match with number of detectors.')
-        else: # DistributionMaker object
-            assert len(metric) == 1
+                raise IndexError(f"Number of defined metrics does not match with number of detectors.")
+        elif isinstance(hypo_asimov_dist, MapSet) or isinstance(hypo_asimov_dist, list):
+            # DistributionMaker object (list means variable binning)
+            assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"
+        else:
+            raise NotImplementedError(f"hypo_maker returned output of type {type(hypo_asimov_dist)}")
         fit_info.metric = metric

         # Assess the fit: whether the data came from the hypo_asimov_dist
@@ -3040,6 +3050,8 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
                 metric_val += data
             priors = hypo_maker.params.priors_penalty(metric=metric[0]) # uses just the "first" metric for prior
             metric_val += priors
+        elif isinstance(data_dist, list):
+            raise NotImplementedError()
         else: # DistributionMaker object

             if 'generalized_poisson_llh' == metric[0]:
@@ -3088,6 +3100,8 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
                 params=hypo_maker.distribution_makers[i].params, metric=metric[i],
                 other_metrics=other_metrics, detector_name=hypo_maker.det_names[i]
             ) for i in range(len(data_dist))]
+        elif isinstance(data_dist, list):
+            raise NotImplementedError()
         else: # DistributionMaker object

             if 'generalized_poisson_llh' == metric[0]:
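For orientation, the branching added throughout analysis.py follows one pattern: Detectors yields one distribution and one metric per detector, a DistributionMaker with variable binning yields a list of MapSets (one per selection) with a single metric, and a plain DistributionMaker yields a single MapSet. A minimal sketch of that dispatch (illustrative only, not the PISA implementation; it assumes MapSet.metric_total and params.priors_penalty as used in the diff above):

def total_metric_value(data_dist, hypo_dist, params, metric, metric_kwargs=None):
    # metric is a list; with a DistributionMaker only metric[0] is used
    if isinstance(hypo_dist, list):
        # variable binning: one MapSet per selection -> accumulate over the list
        metric_val = sum(
            d.metric_total(expected_values=h, metric=metric[0],
                           metric_kwargs=metric_kwargs)
            for d, h in zip(data_dist, hypo_dist)
        )
    else:
        # regular binning: a single MapSet
        metric_val = data_dist.metric_total(expected_values=hypo_dist,
                                            metric=metric[0],
                                            metric_kwargs=metric_kwargs)
    # the prior penalty is added exactly once, not once per selection
    return metric_val + params.priors_penalty(metric=metric[0])
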
72 changes: 69 additions & 3 deletions pisa/core/binning.py
@@ -1590,7 +1590,6 @@ def __init__(self, dimensions, name=None, mask=None):
         # Handle masking
         self._init_mask(mask)

-
     def _init_mask(self, mask) :
         '''
         Initialize the bin mask. This can either be specified as:
@@ -1639,13 +1638,11 @@ def _init_mask(self, mask) :
         # Done, store the mask
         self._mask = mask

-
     @property
     def name(self):
         """Name of the dimension"""
         return self._name

-
     def __repr__(self):
         previous_precision = np.get_printoptions()['precision']
         np.set_printoptions(precision=18)
@@ -3043,6 +3040,75 @@ def __ne__(self, other):
         return not self.__eq__(other)


+class VarBinning(object):
+    # pylint: disable=line-too-long
+    r"""
+    Variable binning.
+
+    """
+    # pylint: enable=line-too-long
+    def __init__(self, binnings, cut_var, name=None, mask=None):
+
+        assert isinstance(cut_var, OneDimBinning)
+        assert isinstance(binnings, list) and len(binnings) == cut_var.size
+        for b in binnings:
+            assert isinstance(b, MultiDimBinning)
+            assert cut_var.name not in b.names
+
+        self._binnings = binnings
+        self._cut_var = cut_var
+        self._name = name
+        self._names = None
+
+    @property
+    def binnings(self):
+        """list of MultiDimBinning : each binning in a list"""
+        return self._binnings
+
+    @property
+    def cut_var(self):
+        """OneDimBinning : variable for which to use different binnings"""
+        return self._cut_var
+
+    @property
+    def name(self):
+        """Name of the dimension"""
+        return self._name
+
+    @property
+    def names(self):
+        """list of strings : names of each dimension contained plus cut var"""
+        if self._names is None:
+            self._names = [self.cut_var.name]
+            for b in self.binnings:
+                self._names.extend([n for n in b.names if n not in self._names])
+        return self._names
+
+    def __pretty__(self, p, cycle):
+        """Method used by the `pretty` library for formatting"""
+        if cycle:
+            p.text('%s(...)' % self.__class__.__name__)
+        else:
+            p.begin_group(4, '%s([' % self.__class__.__name__)
+            for n, dim in enumerate(self):
+                p.breakable()
+                p.pretty(dim)
+                if n < len(self)-1:
+                    p.text(',')
+            p.end_group(4, '])')
+
+    def _repr_pretty_(self, p, cycle):
+        """Method used by e.g. ipython/Jupyter for formatting"""
+        return self.__pretty__(p, cycle)
+
+    def __iter__(self):
+        """Iterate over dimensions. Use `iterbins` to iterate over bins."""
+        return iter(self._binnings)
+
+    def __len__(self):
+        return len(self._binnings)
+
+
 def test_OneDimBinning():
     """Unit tests for OneDimBinning class"""
     # pylint: disable=line-too-long, wrong-import-position
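A minimal usage sketch for the new class, which attaches a different MultiDimBinning to each bin of a one-dimensional cut variable (the variable names, bin counts, and edges below are illustrative assumptions, not taken from the PR):

from pisa import ureg
from pisa.core.binning import OneDimBinning, MultiDimBinning, VarBinning

# cut variable: two pid bins select which analysis binning applies
pid = OneDimBinning('pid', bin_edges=[0.0, 0.5, 1.0])

# one MultiDimBinning per pid bin; neither may contain 'pid' itself
cascades = MultiDimBinning([
    OneDimBinning('reco_energy', num_bins=10, is_log=True, domain=[1, 100] * ureg.GeV),
    OneDimBinning('reco_coszen', num_bins=10, is_lin=True, domain=[-1, 1]),
])
tracks = MultiDimBinning([
    OneDimBinning('reco_energy', num_bins=20, is_log=True, domain=[1, 100] * ureg.GeV),
    OneDimBinning('reco_coszen', num_bins=20, is_lin=True, domain=[-1, 1]),
])

varbin = VarBinning([cascades, tracks], pid, name='pid_split')
print(varbin.names)  # ['pid', 'reco_energy', 'reco_coszen']
print(len(varbin))   # 2 -> iterating yields the two MultiDimBinnings
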
18 changes: 15 additions & 3 deletions pisa/core/container.py
@@ -276,7 +276,7 @@ class Container():
     """

     default_translation_mode = "average"
-    translation_modes = ("average", "sum", None)
+    translation_modes = ("average", "sum")
     array_representations = ("events", "log_events")


@@ -590,6 +590,18 @@ def translate(self, key, src_representation):
             else:
                 raise NotImplementedError(f"translating {src_representation} to {dest_representation}")

+        elif self.tranlation_modes[key] == 'sum':
+            if from_map and to_map:
+                raise NotImplementedError()
+
+            elif to_map:
+                out = self.array_to_binned(key, src_representation, dest_representation, averaged=False)
+                self.representation = dest_representation
+                self[key] = out
+
+            else:
+                raise NotImplementedError()
+
         else:
             raise NotImplementedError()

@@ -611,13 +623,13 @@ def find_valid_representation(self, key):

         precedence = np.inf
         representation = None

         for h in validity.keys():
             if validity[h]:
                 if self.precedence[h] < precedence:
                     precedence = self.precedence[h]
                     representation = self._representations[h]

         return representation

     def resample(self, key, src_representation, dest_representation):
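The new 'sum' translation mode differs from the existing 'average' mode only in how per-event values are reduced onto the binned representation: 'average' takes the mean of the values falling into each bin, 'sum' their total (hence averaged=False in array_to_binned). A toy numpy illustration of the two reductions (not the PISA API):

import numpy as np

edges = np.array([0.0, 1.0, 2.0, 3.0])   # bin edges of the destination binning
sample = np.array([0.2, 0.4, 1.5, 2.7])  # per-event position in the binned variable
values = np.array([1.0, 3.0, 2.0, 4.0])  # per-event quantity to translate

summed, _ = np.histogram(sample, bins=edges, weights=values)   # 'sum' mode
counts, _ = np.histogram(sample, bins=edges)
averaged = np.divide(summed, counts,                           # 'average' mode
                     out=np.zeros_like(summed), where=counts > 0)

print(summed)    # [4. 2. 4.]
print(averaged)  # [2. 2. 4.]
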
19 changes: 9 additions & 10 deletions pisa/core/distribution_maker.py
@@ -281,16 +281,15 @@ def get_outputs(self, return_sum=False, sum_map_name='total',
                 outputs.tex = sum_map_tex_name
                 outputs = MapSet(outputs) # final output must be a MapSet

-            # Case where the output of a pipeline is a dict of different MapSets
-            elif isinstance(outputs[0], OrderedDict):
-                output_dict = OrderedDict()
-                for key in outputs[0].keys():
-                    output_dict[key] = sum([sum(A[key]) for A in outputs]) # This produces a Map objects
-                    output_dict[key].name = sum_map_name
-                    output_dict[key].tex = sum_map_tex_name
-                    output_dict[key] = MapSet(output_dict[key])
-
-                outputs = output_dict
+            # Case where the output of a pipeline is a list of different MapSets
+            elif isinstance(outputs[0], list):
+                outs = []
+                for i in range(len(outputs[0])):
+                    o = sum([sum(x) for x in np.array(outputs)[:, i]])
+                    o.name = sum_map_name
+                    o.tex = sum_map_tex_name
+                    outs.append(MapSet(o))
+                outputs = outs

         return outputs
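The replaced block reduces a list-of-lists pipeline output in two stages: for each variable-binning selection, the maps within each pipeline are summed first, then the per-pipeline sums are added across pipelines. A toy sketch of that index logic, with plain numbers standing in for Map objects (assumed structure, not the PISA API):

# outputs[p][b]: list of maps from pipeline p for variable-binning selection b
outputs = [
    [[1, 2], [10, 20]],   # pipeline 0: selections 0 and 1
    [[3, 4], [30, 40]],   # pipeline 1: selections 0 and 1
]

# per selection: sum maps within each pipeline, then sum over pipelines
totals = [
    sum(sum(pipeline[b]) for pipeline in outputs)
    for b in range(len(outputs[0]))
]
print(totals)  # [10, 100]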