From b58e02cc2d235f6e3745653aef59f43d56d4b4b6 Mon Sep 17 00:00:00 2001
From: Alex Nitz
Date: Mon, 7 Oct 2024 18:30:51 -0400
Subject: [PATCH] Add new extremely fast direct Monte Carlo method (#4678)

* do this

* fix

* metadata

* add lambda check

* revert change

* renaming

* remove unused options

* cc

* cc

* cc

* more cc

---
 pycbc/conversions.py                |   5 +-
 pycbc/inference/sampler/__init__.py |   2 +
 pycbc/inference/sampler/dummy.py    |   3 +
 pycbc/inference/sampler/games.py    | 238 ++++++++++++++++++++++++++++
 4 files changed, 246 insertions(+), 2 deletions(-)
 create mode 100644 pycbc/inference/sampler/games.py

diff --git a/pycbc/conversions.py b/pycbc/conversions.py
index d494e6f24fa..04f5dc1779d 100644
--- a/pycbc/conversions.py
+++ b/pycbc/conversions.py
@@ -431,8 +431,9 @@ def lambda_tilde(mass1, mass2, lambda1, lambda2):
     mask = m1 < m2
     ldiff[mask] = -ldiff[mask]
     eta = eta_from_mass1_mass2(m1, m2)
-    p1 = lsum * (1 + 7. * eta - 31 * eta ** 2.0)
-    p2 = (1 - 4 * eta)**0.5 * (1 + 9 * eta - 11 * eta ** 2.0) * ldiff
+    eta[eta > 0.25] = 0.25  # Account for numerical error, 0.25 is the max
+    p1 = (lsum) * (1 + 7. * eta - 31 * eta ** 2.0)
+    p2 = (1 - 4 * eta)**0.5 * (1 + 9 * eta - 11 * eta ** 2.0) * (ldiff)
     return formatreturn(8.0 / 13.0 * (p1 + p2), input_is_array)
 
 def delta_lambda_tilde(mass1, mass2, lambda1, lambda2):
diff --git a/pycbc/inference/sampler/__init__.py b/pycbc/inference/sampler/__init__.py
index 79787696b8e..094a2298747 100644
--- a/pycbc/inference/sampler/__init__.py
+++ b/pycbc/inference/sampler/__init__.py
@@ -26,6 +26,7 @@
 from .dummy import DummySampler
 from .refine import RefineSampler
 from .snowline import SnowlineSampler
+from .games import GameSampler
 
 # list of available samplers
 samplers = {cls.name: cls for cls in (
@@ -34,6 +35,7 @@
     DummySampler,
     RefineSampler,
     SnowlineSampler,
+    GameSampler,
 )}
 
 try:
diff --git a/pycbc/inference/sampler/dummy.py b/pycbc/inference/sampler/dummy.py
index 2adf12dd561..460ce865f32 100644
--- a/pycbc/inference/sampler/dummy.py
+++ b/pycbc/inference/sampler/dummy.py
@@ -37,6 +37,7 @@ def __init__(self, model, *args, nprocesses=1, use_mpi=False,
         self.num_samples = int(num_samples)
         self.pool = choose_pool(mpi=use_mpi, processes=nprocesses)
         self._samples = {}
+        self.meta = {}
 
     @classmethod
     def from_config(cls, cp, model, output_file=None, nprocesses=1,
@@ -70,6 +71,8 @@ def run(self):
     def finalize(self):
         with self.io(self.checkpoint_file, "a") as fp:
             fp.write_samples(samples=self._samples)
+            for k in self.meta:
+                fp[fp.sampler_group].attrs[k] = self.meta[k]
 
     checkpoint = resume_from_checkpoint = run
diff --git a/pycbc/inference/sampler/games.py b/pycbc/inference/sampler/games.py
new file mode 100644
index 00000000000..3a5931fbd8f
--- /dev/null
+++ b/pycbc/inference/sampler/games.py
@@ -0,0 +1,238 @@
+""" Direct Monte Carlo sampling using pre-generated mapping files that
+encode the intrinsic parameter space.
+"""
+import logging
+import tqdm
+import h5py
+import numpy
+import numpy.random
+from scipy.special import logsumexp
+from pycbc.io import FieldArray
+
+from pycbc.inference import models
+from pycbc.pool import choose_pool
+from .dummy import DummySampler
+
+
+def call_likelihood(params):
+    """ Update the global model with the given parameters and return
+    its log likelihood.
+    """
+    models._global_instance.update(**params)
+    return models._global_instance.loglikelihood
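+
+# Worker processes receive only small parameter dictionaries through the
+# pool; the model itself lives once per process in
+# ``models._global_instance``, so nothing heavyweight is pickled per call.
+# A minimal sketch of the call path (the parameter names are hypothetical):
+#
+#     args = [{'mass1': 35.0, 'mass2': 30.0},
+#             {'mass1': 20.0, 'mass2': 19.5}]
+#     loglrs = list(pool.imap(call_likelihood, args))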
+
+
+class OutOfSamples(Exception):
+    """Raised when a bin has no samples left to draw"""
+
+
+class GameSampler(DummySampler):
+    """Direct importance sampling using a pre-constructed parameter space
+    mapping file.
+
+    Parameters
+    ----------
+    model : Model
+        An instance of a model from ``pycbc.inference.models``.
+    mapfile : str
+        Path to the pre-generated file containing the pre-mapped prior
+        volume.
+    loglr_region : float
+        Only use regions from the prior volume tiling that are within
+        this loglr difference of the maximum tile.
+    target_likelihood_calls : int
+        Try to use this many likelihood calls in each round of the
+        analysis.
+    rounds : int
+        The number of sampling rounds to perform before terminating.
+    """
+    name = 'games'
+
+    def __init__(self, model, *args, nprocesses=1, use_mpi=False,
+                 mapfile=None,
+                 loglr_region=25,
+                 target_likelihood_calls=1e5,
+                 rounds=1,
+                 **kwargs):
+        super().__init__(model, *args)
+
+        self.meta = {}
+        self.mapfile = mapfile
+        self.rounds = int(rounds)
+        self.dmap = {}
+        self.draw = {}
+
+        models._global_instance = model
+        self.model = model
+        self.pool = choose_pool(mpi=use_mpi, processes=nprocesses)
+        self._samples = {}
+
+        self.target_likelihood_calls = int(target_likelihood_calls)
+        self.loglr_region = float(loglr_region)
+
+    def draw_samples_from_bin(self, i, size):
+        """ Draw ``size`` points from bin ``i`` of the pre-mapped prior
+        space, without replacement across calls.
+        """
+        if i not in self.draw:
+            self.draw[i] = numpy.arange(0, len(self.dmap[i]))
+
+        if size > len(self.draw[i]):
+            raise OutOfSamples
+
+        numpy.random.shuffle(self.draw[i])
+        selected = self.draw[i][:size]
+        self.draw[i] = self.draw[i][size:]
+
+        if size > 0:
+            remain = len(self.draw[i])
+            logging.info('Drew %i, %i remains in bin %i', size, remain, i)
+
+        return self.dmap[i][selected]
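+
+    # Bins are consumed without replacement: successive calls return
+    # disjoint subsets of a bin until it is exhausted, at which point
+    # OutOfSamples is raised.  An illustrative sketch, for a sampler
+    # instance whose bin 3 holds at least 200 points (hypothetical sizes):
+    #
+    #     first = sampler.draw_samples_from_bin(3, 100)   # 100 points
+    #     second = sampler.draw_samples_from_bin(3, 100)  # 100 more, disjoint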
+
+    def sample_round(self, bin_weight, node_idx, lengths):
+        """ Sample from the posterior using pre-binned sets of points and
+        the weighting factor of each bin.
+
+        Parameters
+        ----------
+        bin_weight : Array
+            The importance weighting factor of each bin of the prior
+            space.
+        node_idx : Array
+            The set of ids into the pre-binned prior volume to use. These
+            should map one-to-one to the given weights.
+        lengths : Array
+            The size of each bin, used to self-normalize the weights.
+        """
+        logging.info("...draw template bins")
+        drawcount = (bin_weight * self.target_likelihood_calls).astype(int)
+
+        # Allocate likelihood calls to bins in proportion to their weight;
+        # any allocation exceeding a bin's size is redistributed to the
+        # next-highest-weight bins that still have room.
+        dorder = bin_weight.argsort()[::-1]
+        remainder = 0
+        for i in dorder:
+            bincount = drawcount[i]
+            binlen = lengths[i]
+            if bincount > binlen:
+                drawcount[i] = binlen
+                remainder += bincount - binlen
+            elif bincount < binlen:
+                asize = min(binlen - bincount, remainder)
+                drawcount[i] += asize
+                remainder -= asize
+
+        drawweight = bin_weight / drawcount
+        total_draw = drawcount.sum()
+
+        logging.info('...drawing random points within bins')
+        psamp = FieldArray(total_draw, dtype=self.dtype)
+        pweight = numpy.zeros(total_draw, dtype=float)
+        bin_id = numpy.zeros(total_draw, dtype=int)
+        j = 0
+        for i, (c, w) in enumerate(zip(drawcount, drawweight)):
+            bdraw = self.draw_samples_from_bin(node_idx[i], c)
+            psamp[j:j+len(bdraw)] = FieldArray.from_records(bdraw,
+                                                            dtype=self.dtype)
+            pweight[j:j+len(bdraw)] = numpy.log(bin_weight[i]) - numpy.log(w)
+            bin_id[j:j+len(bdraw)] = i
+            j += len(bdraw)
+
+        logging.info("Possible unique values %s", lengths.sum())
+        logging.info("Templates drawn from %s", len(lengths))
+        logging.info("Unique values first draw %s", len(psamp))
+
+        # Calculate the likelihood values for the unique parameter space
+        # points
+        args = []
+        for i in range(len(psamp)):
+            pset = {p: psamp[p][i] for p in self.model.variable_params}
+            args.append(pset)
+
+        loglr_samp = list(tqdm.tqdm(self.pool.imap(call_likelihood, args),
+                                    total=len(args)))
+        loglr_samp = numpy.array(loglr_samp)
+
+        # Calculate the weights from the actual likelihood relative to the
+        # initial proposal weights
+        logw3 = loglr_samp + numpy.log(lengths[bin_id]) - pweight
+        logw3 -= logsumexp(logw3)
+        weight2 = numpy.exp(logw3)
+        return psamp, loglr_samp, weight2, bin_id
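+
+    # In log form, the importance weight sample_round assigns to each drawn
+    # point k is
+    #
+    #     log w_k = loglr_k + log(L_b(k)) - log(n_b(k))
+    #
+    # where L_b is the size of the bin that point k came from and n_b is
+    # the number of points drawn from that bin (pweight above reduces to
+    # log(drawcount) because drawweight = bin_weight / drawcount).  The
+    # weights are normalized with logsumexp before being returned.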
+
+    def run(self):
+        """ Produce posterior samples """
+        logging.info('Retrieving params of parameter space nodes')
+        with h5py.File(self.mapfile, 'r') as mapfile:
+            bparams = {p: mapfile['bank'][p][:] for p in self.variable_params}
+            num_nodes = len(bparams[list(bparams.keys())[0]])
+            lengths = numpy.array([len(mapfile['map'][str(x)])
+                                   for x in range(num_nodes)])
+            self.dtype = mapfile['map']['0'].dtype
+
+        logging.info('Calculating likelihood at nodes')
+        args = []
+        for i in range(num_nodes):
+            pset = {p: bparams[p][i] for p in self.model.variable_params}
+            args.append(pset)
+
+        node_loglrs = list(tqdm.tqdm(self.pool.imap(call_likelihood, args),
+                                     total=len(args)))
+        node_loglrs = numpy.array(node_loglrs)
+        loglr_bound = node_loglrs[~numpy.isnan(node_loglrs)].max()
+        loglr_bound -= self.loglr_region
+
+        logging.info('Drawing proposal samples from node regions')
+        logw = node_loglrs + numpy.log(lengths)
+        passed = (node_loglrs > loglr_bound) & ~numpy.isnan(node_loglrs)
+        passed = numpy.where(passed)[0]
+        logw2 = logw[passed]
+        logw2 -= logsumexp(logw2)
+        weight = numpy.exp(logw2)
+
+        logging.info("...reading template bins")
+        with h5py.File(self.mapfile, 'r') as mapfile:
+            for i in passed:
+                self.dmap[i] = mapfile['map'][str(i)][:]
+
+        # Sample from posterior
+        psamp = None
+        loglr_samp = None
+        weight2 = None
+        bin_ids = None
+
+        # Start each surviving bin with weight proportional to its size;
+        # the weights are refined after each round using the measured
+        # likelihoods
+        weight = lengths[passed] / lengths[passed].sum()
+
+        for i in range(self.rounds):
+            try:
+                psamp_v, loglr_samp_v, weight2_v, bin_id = \
+                    self.sample_round(weight / weight.sum(),
+                                      passed, lengths[passed])
+            except OutOfSamples:
+                logging.info("No more samples to draw from")
+                break
+
+            for j, v in zip(bin_id, weight2_v):
+                weight[j] += v
+
+            if psamp is None:
+                psamp = psamp_v
+                loglr_samp = loglr_samp_v
+                weight2 = weight2_v
+                bin_ids = bin_id
+            else:
+                psamp = numpy.concatenate([psamp_v, psamp])
+                loglr_samp = numpy.concatenate([loglr_samp_v, loglr_samp])
+                weight2 = numpy.concatenate([weight2_v, weight2])
+                bin_ids = numpy.concatenate([bin_id, bin_ids])
+
+            ess = 1.0 / ((weight2 / weight2.sum()) ** 2.0).sum()
+            logging.info("ESS = %s", ess)
+
+        # Prepare the equally weighted output samples
+        self.meta['ncalls'] = len(weight2)
+        self.meta['ess'] = ess
+
+        weight2 /= weight2.sum()
+        draw2 = numpy.random.choice(len(psamp), size=int(ess),
+                                    replace=True, p=weight2)
+        logging.info("Unique values second draw %s",
+                     len(numpy.unique(psamp[draw2])))
+
+        fsamp = FieldArray(len(draw2), dtype=self.dtype)
+        for i, v in enumerate(draw2):
+            fsamp[i] = psamp[v]
+
+        self._samples = {p: fsamp[p] for p in self.model.variable_params}
+        self._samples['loglikelihood'] = loglr_samp[draw2]
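+
+
+# A plausible pycbc_inference configuration selecting this sampler might
+# look like the following.  The option names assume the kwargs-from-config
+# forwarding inherited from DummySampler, and the file name and values are
+# purely illustrative:
+#
+#     [sampler]
+#     name = games
+#     mapfile = parameter_space_map.hdf
+#     loglr_region = 25
+#     target_likelihood_calls = 100000
+#     rounds = 2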