Skip to content

Commit fe8a899

Browse files
JanWeldertthehrh
andauthored
Alternative variable binning approach (#849)
* Add variable binning and adjust output functions * Force events representation and add histogramming to translations * Adjust analysis.py and config parser * Add example * Expand example a bit * fit_hypo does not know hypo_asimov_dist * generalise pipeline config parser and varbinning to accept selection string also * Make selection string work with container * Check if selections are exclusive * Define selection before checking it * Simple test and more checks for VarBinning * docs & comments, split up check_varbinning (was performing two very different tasks), new pipeline unit test & minor logic fix * exclusivity check only when requesting new output binning in get_outputs * Add docstrings and change assert to warn * fix undefined varbinning name in config parser, debug logging of selection counts and warn if total is zero, add test for non-exclusive selections and empty selections * superficial: comments & docstrings, thousands of lines of ipynb output cleared, moved core pipeline output calculations into separate functions for clarity * also make separate functions for parsing different types of binning from config + minor * adapt varbinning init and tests (require > 1 binnings, detect any binning dim. in a cut expression, allow duplicate binnings, more detailed logging); add notes to class docstring; couple superficial mods * comments/NotImplementedErrors for generalized_poisson_llh in analysis.py; comment on possible caching optimisation in pipeline.py; use built-in DistributionMaker profiling functionality in notebook and reduce no. of pseudoexperiments for usability --------- Co-authored-by: T Ehrhardt <thomas.ehrhardt@icecube.wisc.edu>
1 parent 3d2c212 commit fe8a899

File tree

10 files changed

+1289
-125
lines changed

10 files changed

+1289
-125
lines changed

pisa/analysis/analysis.py

+79-33
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,8 @@ def merge_mapsets_together(mapset_list=None):
195195
'''Handle merging of multiple MapSets, when they come in
196196
the shape of a dict
197197
198+
TODO: might not be required any longer (see various comments on
199+
generalized_poisson_llh)
198200
'''
199201

200202
if isinstance(mapset_list[0], Mapping):
@@ -647,16 +649,27 @@ def __init__(
647649
assert hypo_maker is not None, msg
648650
assert data_dist is not None, msg
649651
assert metric is not None, msg
652+
# this passes through the setter method, but it should just pass through
653+
# without actually doing anything
650654
if hypo_maker.__class__.__name__ == "Detectors":
651-
# this passes through the setter method, but it should just pass through
652-
# without actually doing anything
653655
self.detailed_metric_info = [self.get_detailed_metric_info(
654656
data_dist=data_dist[i], hypo_asimov_dist=self.hypo_asimov_dist[i],
655657
params=hypo_maker.distribution_makers[i].params, metric=metric[i],
656-
other_metrics=other_metrics, detector_name=hypo_maker.det_names[i], hypo_maker=hypo_maker
658+
other_metrics=other_metrics, detector_name=hypo_maker.det_names[i], hypo_maker=hypo_maker,
657659
) for i in range(len(data_dist))]
658-
else: # DistributionMaker object
659-
if 'generalized_poisson_llh' == metric[0]:
660+
elif isinstance(data_dist, list): # DistributionMaker object with variable binning
661+
self.detailed_metric_info = [self.get_detailed_metric_info(
662+
data_dist=data_dist[i], hypo_asimov_dist=self.hypo_asimov_dist[i],
663+
params=hypo_maker.params, metric=metric[0],
664+
other_metrics=other_metrics, detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker,
665+
) for i in range(len(data_dist))]
666+
else: # DistributionMaker object with regular binning
667+
if metric[0] == 'generalized_poisson_llh':
668+
raise NotImplementedError(
669+
"generalized_poisson_llh isn't correctly implemented any longer!"
670+
)
671+
# FIXME: these `output_mode` and `force_standard_output` kwargs were removed in
672+
# https://github.com/icecube/pisa/commit/7a4e875aa7bdc52ea64a5270e9808d866d1395f3
660673
generalized_poisson_dist = hypo_maker.get_outputs(return_sum=False, force_standard_output=False)
661674
generalized_poisson_dist = merge_mapsets_together(mapset_list=generalized_poisson_dist)
662675
else:
@@ -665,7 +678,7 @@ def __init__(
665678
self.detailed_metric_info = self.get_detailed_metric_info(
666679
data_dist=data_dist, hypo_asimov_dist=self.hypo_asimov_dist, generalized_poisson_hypo=generalized_poisson_dist,
667680
params=hypo_maker.params, metric=metric[0], other_metrics=other_metrics,
668-
detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker
681+
detector_name=hypo_maker.detector_name, hypo_maker=hypo_maker,
669682
)
670683

671684
def __getitem__(self, i):
@@ -810,6 +823,7 @@ def get_detailed_metric_info(data_dist, hypo_maker, hypo_asimov_dist, params, me
810823
detailed_metric_info[m] = name_vals_d
811824

812825
else:
826+
# TODO: remove this case?
813827
if isinstance(hypo_asimov_dist, OrderedDict):
814828
hypo_asimov_dist = hypo_asimov_dist['weights']
815829

@@ -1180,17 +1194,6 @@ def fit_recursively(
11801194
11811195
"""
11821196

1183-
if isinstance(metric, str):
1184-
metric = [metric]
1185-
1186-
# Before starting any fit, check if we already have a perfect match between data and template
1187-
# This can happen if using pseudodata that was generated with the nominal values for parameters
1188-
# (which will also be the initial values in the fit) and blah...
1189-
# If this is the case, don't both to fit and return results right away.
1190-
1191-
if isinstance(metric, str):
1192-
metric = [metric]
1193-
11941197
# Grab the hypo map
11951198
hypo_asimov_dist = hypo_maker.get_outputs(return_sum=True)
11961199

@@ -1201,12 +1204,20 @@ def fit_recursively(
12011204
if len(metric) == 1: # One metric for all detectors
12021205
metric = list(metric) * len(hypo_maker.distribution_makers)
12031206
elif len(metric) != len(hypo_maker.distribution_makers):
1204-
raise IndexError('Number of defined metrics does not match with number of detectors.')
1205-
else: # DistributionMaker object
1206-
assert len(metric) == 1
1207+
raise IndexError(f"Number of defined metrics does not match with number of detectors.")
1208+
elif isinstance(hypo_asimov_dist, MapSet) or isinstance(hypo_asimov_dist, list):
1209+
# DistributionMaker object (list means variable binning)
1210+
assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"
1211+
else:
1212+
raise NotImplementedError(f"hypo_maker returned output of type {type(hypo_asimov_dist)}")
12071213

1208-
# Check if the hypo matches data
1209-
if hypo_maker.__class__.__name__ != "Detectors" and data_dist.allclose(hypo_asimov_dist) :
1214+
# Before starting any fit, check if we already have a perfect match
1215+
# between data and template. This can happen if using pseudodata that
1216+
# was generated with the nominal values for parameters (which will also
1217+
# be the initial values in the fit). If this is the case, don't bother
1218+
# to fit and return results right away. TODO: This speedup is currently
1219+
# only enabled for a DistributionMaker with regular binning.
1220+
if isinstance(data_dist, MapSet) and data_dist.allclose(hypo_asimov_dist):
12101221

12111222
msg = 'Initial hypo matches data, no need for fit'
12121223
logging.info(msg)
@@ -2756,12 +2767,18 @@ def _minimizer_callable(self, scaled_param_vals, hypo_maker, data_dist,
27562767
# Get the map set
27572768
try:
27582769
if metric[0] == 'generalized_poisson_llh':
2770+
raise NotImplementedError(
2771+
"generalized_poisson_llh isn't correctly implemented any longer!"
2772+
)
2773+
# FIXME: these `output_mode` and `force_standard_output` kwargs were removed in
2774+
# https://github.com/icecube/pisa/commit/7a4e875aa7bdc52ea64a5270e9808d866d1395f3
27592775
hypo_asimov_dist = hypo_maker.get_outputs(return_sum=False, output_mode='binned', force_standard_output=False)
27602776
hypo_asimov_dist = merge_mapsets_together(mapset_list=hypo_asimov_dist)
27612777
data_dist = data_dist.maps[0] # Extract the map from the MapSet
27622778
metric_kwargs = {'empty_bins':hypo_maker.empty_bin_indices}
27632779
else:
27642780
hypo_asimov_dist = hypo_maker.get_outputs(return_sum=True)
2781+
# TODO: can be removed? (see same commit as above)
27652782
if isinstance(hypo_asimov_dist, OrderedDict):
27662783
hypo_asimov_dist = hypo_asimov_dist['weights']
27672784
metric_kwargs = {}
@@ -2785,11 +2802,17 @@ def _minimizer_callable(self, scaled_param_vals, hypo_maker, data_dist,
27852802
metric_val = 0
27862803
for i in range(len(hypo_maker.distribution_makers)):
27872804
data = data_dist[i].metric_total(expected_values=hypo_asimov_dist[i],
2788-
metric=metric[i], metric_kwargs=metric_kwargs)
2805+
metric=metric[i], metric_kwargs=metric_kwargs)
27892806
metric_val += data
27902807
priors = hypo_maker.params.priors_penalty(metric=metric[0]) # uses just the "first" metric for prior
27912808
metric_val += priors
2792-
else: # DistributionMaker object
2809+
elif isinstance(hypo_asimov_dist, list): # DistributionMaker object with variable binning
2810+
metric_val = 0
2811+
for i in range(len(hypo_asimov_dist)):
2812+
metric_val += data_dist[i].metric_total(expected_values=hypo_asimov_dist[i],
2813+
metric=metric[0], metric_kwargs=metric_kwargs)
2814+
metric_val += hypo_maker.params.priors_penalty(metric=metric[0])
2815+
else: # DistributionMaker object with regular binning
27932816
if metric[0] == 'weighted_chi2':
27942817
actual_values = data_dist.hist['total']
27952818
expected_values = hypo_asimov_dist.hist['total']
@@ -3121,9 +3144,9 @@ def fit_hypo(self, data_dist, hypo_maker, metric, minimizer_settings,
31213144
if len(metric) == 1: # One metric for all detectors
31223145
metric = list(metric) * len(hypo_maker.distribution_makers)
31233146
elif len(metric) != len(hypo_maker.distribution_makers):
3124-
raise IndexError('Number of defined metrics does not match with number of detectors.')
3125-
else: # DistributionMaker object
3126-
assert len(metric) == 1
3147+
raise IndexError(f"Number of defined metrics does not match with number of detectors.")
3148+
else:
3149+
assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"
31273150

31283151
if check_ordering:
31293152
if 'nh' in hypo_param_selections or 'ih' in hypo_param_selections:
@@ -3209,9 +3232,12 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
32093232
if len(metric) == 1: # One metric for all detectors
32103233
metric = list(metric) * len(hypo_maker.distribution_makers)
32113234
elif len(metric) != len(hypo_maker.distribution_makers):
3212-
raise IndexError('Number of defined metrics does not match with number of detectors.')
3213-
else: # DistributionMaker object
3214-
assert len(metric) == 1
3235+
raise IndexError(f"Number of defined metrics does not match with number of detectors.")
3236+
elif isinstance(hypo_asimov_dist, MapSet) or isinstance(hypo_asimov_dist, list):
3237+
# DistributionMaker object (list means variable binning)
3238+
assert len(metric) == 1, f"Only one metric allowed for DistributionMaker"
3239+
else:
3240+
raise NotImplementedError(f"hypo_maker returned output of type {type(hypo_asimov_dist)}")
32153241
fit_info.metric = metric
32163242

32173243
# Assess the fit: whether the data came from the hypo_asimov_dist
@@ -3223,10 +3249,19 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
32233249
metric_val += data
32243250
priors = hypo_maker.params.priors_penalty(metric=metric[0]) # uses just the "first" metric for prior
32253251
metric_val += priors
3226-
else: # DistributionMaker object
3252+
elif isinstance(data_dist, list): # DistributionMaker object with VarBinning
3253+
metric_val = 0
3254+
for i in range(len(data_dist)):
3255+
metric_val += data_dist[i].metric_total(expected_values=hypo_asimov_dist[i], metric=metric[0])
3256+
metric_val += hypo_maker.params.priors_penalty(metric=metric[0])
3257+
else: # DistributionMaker object with MultiDimBinning
32273258

32283259
if 'generalized_poisson_llh' == metric[0]:
3229-
3260+
raise NotImplementedError(
3261+
"generalized_poisson_llh isn't correctly implemented any longer!"
3262+
)
3263+
# FIXME: these `output_mode` and `force_standard_output` kwargs were removed in
3264+
# https://github.com/icecube/pisa/commit/7a4e875aa7bdc52ea64a5270e9808d866d1395f3
32303265
hypo_asimov_dist = hypo_maker.get_outputs(return_sum=False, output_mode='binned', force_standard_output=False)
32313266
hypo_asimov_dist = merge_mapsets_together(mapset_list=hypo_asimov_dist)
32323267
data_dist = data_dist.maps[0] # Extract the map from the MapSet
@@ -3271,9 +3306,20 @@ def nofit_hypo(self, data_dist, hypo_maker, hypo_param_selections,
32713306
params=hypo_maker.distribution_makers[i].params, metric=metric[i],
32723307
other_metrics=other_metrics, detector_name=hypo_maker.det_names[i]
32733308
) for i in range(len(data_dist))]
3274-
else: # DistributionMaker object
3309+
elif isinstance(data_dist, list): # DistributionMaker object with VarBinning
3310+
fit_info.detailed_metric_info = [fit_info.get_detailed_metric_info(
3311+
data_dist=data_dist[i], hypo_asimov_dist=hypo_asimov_dist[i],
3312+
params=hypo_maker.params, metric=metric[0], other_metrics=other_metrics,
3313+
detector_name=hypo_maker.detector_name
3314+
) for i in range(len(data_dist))]
3315+
else: # DistributionMaker object with MultiDimBinning
32753316

32763317
if 'generalized_poisson_llh' == metric[0]:
3318+
raise NotImplementedError(
3319+
"generalized_poisson_llh isn't correctly implemented any longer!"
3320+
)
3321+
# FIXME: these `output_mode` and `force_standard_output` kwargs were removed in
3322+
# https://github.com/icecube/pisa/commit/7a4e875aa7bdc52ea64a5270e9808d866d1395f3
32773323
generalized_poisson_dist = hypo_maker.get_outputs(return_sum=False, force_standard_output=False)
32783324
generalized_poisson_dist = merge_mapsets_together(mapset_list=generalized_poisson_dist)
32793325
else:

0 commit comments

Comments
 (0)