From b58e02cc2d235f6e3745653aef59f43d56d4b4b6 Mon Sep 17 00:00:00 2001
From: Alex Nitz
Date: Mon, 7 Oct 2024 18:30:51 -0400
Subject: [PATCH] Add new extremely fast direct Monte Carlo method (#4678)

* do this

* fix

* metadata

* add lambda check

* revert change

* renaming

* remove unused options

* cc

* cc

* cc

* more cc

---
 pycbc/conversions.py                |   5 +-
 pycbc/inference/sampler/__init__.py |   2 +
 pycbc/inference/sampler/dummy.py    |   3 +
 pycbc/inference/sampler/games.py    | 238 ++++++++++++++++++++++++++++
 4 files changed, 246 insertions(+), 2 deletions(-)
 create mode 100644 pycbc/inference/sampler/games.py

diff --git a/pycbc/conversions.py b/pycbc/conversions.py
index d494e6f24fa..04f5dc1779d 100644
--- a/pycbc/conversions.py
+++ b/pycbc/conversions.py
@@ -431,8 +431,9 @@ def lambda_tilde(mass1, mass2, lambda1, lambda2):
     mask = m1 < m2
     ldiff[mask] = -ldiff[mask]
     eta = eta_from_mass1_mass2(m1, m2)
-    p1 = lsum * (1 + 7. * eta - 31 * eta ** 2.0)
-    p2 = (1 - 4 * eta)**0.5 * (1 + 9 * eta - 11 * eta ** 2.0) * ldiff
+    eta[eta > 0.25] = 0.25  # Account for numerical error, 0.25 is the max
+    p1 = (lsum) * (1 + 7. * eta - 31 * eta ** 2.0)
+    p2 = (1 - 4 * eta)**0.5 * (1 + 9 * eta - 11 * eta ** 2.0) * (ldiff)
     return formatreturn(8.0 / 13.0 * (p1 + p2), input_is_array)
 
 def delta_lambda_tilde(mass1, mass2, lambda1, lambda2):
diff --git a/pycbc/inference/sampler/__init__.py b/pycbc/inference/sampler/__init__.py
index 79787696b8e..094a2298747 100644
--- a/pycbc/inference/sampler/__init__.py
+++ b/pycbc/inference/sampler/__init__.py
@@ -26,6 +26,7 @@
 from .dummy import DummySampler
 from .refine import RefineSampler
 from .snowline import SnowlineSampler
+from .games import GameSampler
 
 # list of available samplers
 samplers = {cls.name: cls for cls in (
@@ -34,6 +35,7 @@
     DummySampler,
     RefineSampler,
     SnowlineSampler,
+    GameSampler,
 )}
 
 try:
diff --git a/pycbc/inference/sampler/dummy.py b/pycbc/inference/sampler/dummy.py
index 2adf12dd561..460ce865f32 100644
--- a/pycbc/inference/sampler/dummy.py
+++ b/pycbc/inference/sampler/dummy.py
@@ -37,6 +37,7 @@ def __init__(self, model, *args, nprocesses=1, use_mpi=False,
         self.num_samples = int(num_samples)
         self.pool = choose_pool(mpi=use_mpi, processes=nprocesses)
         self._samples = {}
+        self.meta = {}
 
     @classmethod
     def from_config(cls, cp, model, output_file=None, nprocesses=1,
@@ -70,6 +71,8 @@ def run(self):
     def finalize(self):
         with self.io(self.checkpoint_file, "a") as fp:
             fp.write_samples(samples=self._samples)
+            for k in self.meta:
+                fp[fp.sampler_group].attrs[k] = self.meta[k]
 
     checkpoint = resume_from_checkpoint = run
diff --git a/pycbc/inference/sampler/games.py b/pycbc/inference/sampler/games.py
new file mode 100644
index 00000000000..3a5931fbd8f
--- /dev/null
+++ b/pycbc/inference/sampler/games.py
@@ -0,0 +1,238 @@
+""" Direct Monte Carlo sampling using pre-generated mapping files that
+encode the intrinsic parameter space.
+"""
+import logging
+import tqdm
+import h5py
+import numpy
+import numpy.random
+from scipy.special import logsumexp
+from pycbc.io import FieldArray
+
+from pycbc.inference import models
+from pycbc.pool import choose_pool
+from .dummy import DummySampler
+
+
+def call_likelihood(params):
+    """ Update the global model with the given parameters and return
+    its log likelihood.
+    """
+    models._global_instance.update(**params)
+    return models._global_instance.loglikelihood
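+
+# Worker processes receive only small parameter dictionaries through the
+# pool; the model itself lives once per process in
+# ``models._global_instance``, so nothing heavyweight is pickled per call.
+# A minimal sketch of the call path (the parameter names are hypothetical):
+#
+#     args = [{'mass1': 35.0, 'mass2': 30.0},
+#             {'mass1': 20.0, 'mass2': 19.5}]
+#     loglrs = list(pool.imap(call_likelihood, args))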
+
+
+class OutOfSamples(Exception):
+    """Raised when a bin has no samples left to draw"""
+
+
+class GameSampler(DummySampler):
+    """Direct importance sampling using a pre-constructed parameter space
+    mapping file.
+
+    Parameters
+    ----------
+    model : Model
+        An instance of a model from ``pycbc.inference.models``.
+    mapfile : str
+        Path to the pre-generated file containing the pre-mapped prior
+        volume.
+    loglr_region : float
+        Only use regions from the prior volume tiling that are within
+        this loglr difference of the maximum tile.
+    target_likelihood_calls : int
+        Try to use this many likelihood calls in each round of the
+        analysis.
+    rounds : int
+        The number of sampling rounds to perform before terminating.
+    """
+    name = 'games'
+
+    def __init__(self, model, *args, nprocesses=1, use_mpi=False,
+                 mapfile=None,
+                 loglr_region=25,
+                 target_likelihood_calls=1e5,
+                 rounds=1,
+                 **kwargs):
+        super().__init__(model, *args)
+
+        self.meta = {}
+        self.mapfile = mapfile
+        self.rounds = int(rounds)
+        self.dmap = {}
+        self.draw = {}
+
+        models._global_instance = model
+        self.model = model
+        self.pool = choose_pool(mpi=use_mpi, processes=nprocesses)
+        self._samples = {}
+
+        self.target_likelihood_calls = int(target_likelihood_calls)
+        self.loglr_region = float(loglr_region)
+
+    def draw_samples_from_bin(self, i, size):
+        """ Draw ``size`` points from bin ``i`` of the pre-mapped prior
+        space, without replacement across calls.
+        """
+        if i not in self.draw:
+            self.draw[i] = numpy.arange(0, len(self.dmap[i]))
+
+        if size > len(self.draw[i]):
+            raise OutOfSamples
+
+        numpy.random.shuffle(self.draw[i])
+        selected = self.draw[i][:size]
+        self.draw[i] = self.draw[i][size:]
+
+        if size > 0:
+            remain = len(self.draw[i])
+            logging.info('Drew %i, %i remains in bin %i', size, remain, i)
+
+        return self.dmap[i][selected]
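+
+    # Bins are consumed without replacement: successive calls return
+    # disjoint subsets of a bin until it is exhausted, at which point
+    # OutOfSamples is raised.  An illustrative sketch, for a sampler
+    # instance whose bin 3 holds at least 200 points (hypothetical sizes):
+    #
+    #     first = sampler.draw_samples_from_bin(3, 100)   # 100 points
+    #     second = sampler.draw_samples_from_bin(3, 100)  # 100 more, disjoint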
+
+    def sample_round(self, bin_weight, node_idx, lengths):
+        """ Sample from the posterior using pre-binned sets of points and
+        the weighting factor of each bin.
+
+        Parameters
+        ----------
+        bin_weight : Array
+            The importance weighting factor of each bin of the prior
+            space.
+        node_idx : Array
+            The set of ids into the pre-binned prior volume to use. These
+            should map one-to-one to the given weights.
+        lengths : Array
+            The size of each bin, used to self-normalize the weights.
+        """
+        logging.info("...draw template bins")
+        drawcount = (bin_weight * self.target_likelihood_calls).astype(int)
+
+        # Allocate likelihood calls to bins in proportion to their weight;
+        # any allocation exceeding a bin's size is redistributed to the
+        # next-highest-weight bins that still have room.
+        dorder = bin_weight.argsort()[::-1]
+        remainder = 0
+        for i in dorder:
+            bincount = drawcount[i]
+            binlen = lengths[i]
+            if bincount > binlen:
+                drawcount[i] = binlen
+                remainder += bincount - binlen
+            elif bincount < binlen:
+                asize = min(binlen - bincount, remainder)
+                drawcount[i] += asize
+                remainder -= asize
+
+        drawweight = bin_weight / drawcount
+        total_draw = drawcount.sum()
+
+        logging.info('...drawing random points within bins')
+        psamp = FieldArray(total_draw, dtype=self.dtype)
+        pweight = numpy.zeros(total_draw, dtype=float)
+        bin_id = numpy.zeros(total_draw, dtype=int)
+        j = 0
+        for i, (c, w) in enumerate(zip(drawcount, drawweight)):
+            bdraw = self.draw_samples_from_bin(node_idx[i], c)
+            psamp[j:j+len(bdraw)] = FieldArray.from_records(bdraw,
+                                                            dtype=self.dtype)
+            pweight[j:j+len(bdraw)] = numpy.log(bin_weight[i]) - numpy.log(w)
+            bin_id[j:j+len(bdraw)] = i
+            j += len(bdraw)
+
+        logging.info("Possible unique values %s", lengths.sum())
+        logging.info("Templates drawn from %s", len(lengths))
+        logging.info("Unique values first draw %s", len(psamp))
+
+        # Calculate the likelihood values for the unique parameter space
+        # points
+        args = []
+        for i in range(len(psamp)):
+            pset = {p: psamp[p][i] for p in self.model.variable_params}
+            args.append(pset)
+
+        loglr_samp = list(tqdm.tqdm(self.pool.imap(call_likelihood, args),
+                                    total=len(args)))
+        loglr_samp = numpy.array(loglr_samp)
+
+        # Calculate the weights from the actual likelihood relative to the
+        # initial proposal weights
+        logw3 = loglr_samp + numpy.log(lengths[bin_id]) - pweight
+        logw3 -= logsumexp(logw3)
+        weight2 = numpy.exp(logw3)
+        return psamp, loglr_samp, weight2, bin_id
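+
+    # In log form, the importance weight sample_round assigns to each drawn
+    # point k is
+    #
+    #     log w_k = loglr_k + log(L_b(k)) - log(n_b(k))
+    #
+    # where L_b is the size of the bin that point k came from and n_b is
+    # the number of points drawn from that bin (pweight above reduces to
+    # log(drawcount) because drawweight = bin_weight / drawcount).  The
+    # weights are normalized with logsumexp before being returned.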
+
+    def run(self):
+        """ Produce posterior samples """
+        logging.info('Retrieving params of parameter space nodes')
+        with h5py.File(self.mapfile, 'r') as mapfile:
+            bparams = {p: mapfile['bank'][p][:] for p in self.variable_params}
+            num_nodes = len(bparams[list(bparams.keys())[0]])
+            lengths = numpy.array([len(mapfile['map'][str(x)])
+                                   for x in range(num_nodes)])
+            self.dtype = mapfile['map']['0'].dtype
+
+        logging.info('Calculating likelihood at nodes')
+        args = []
+        for i in range(num_nodes):
+            pset = {p: bparams[p][i] for p in self.model.variable_params}
+            args.append(pset)
+
+        node_loglrs = list(tqdm.tqdm(self.pool.imap(call_likelihood, args),
+                                     total=len(args)))
+        node_loglrs = numpy.array(node_loglrs)
+        loglr_bound = node_loglrs[~numpy.isnan(node_loglrs)].max()
+        loglr_bound -= self.loglr_region
+
+        logging.info('Drawing proposal samples from node regions')
+        logw = node_loglrs + numpy.log(lengths)
+        passed = (node_loglrs > loglr_bound) & ~numpy.isnan(node_loglrs)
+        passed = numpy.where(passed)[0]
+        logw2 = logw[passed]
+        logw2 -= logsumexp(logw2)
+        weight = numpy.exp(logw2)
+
+        logging.info("...reading template bins")
+        with h5py.File(self.mapfile, 'r') as mapfile:
+            for i in passed:
+                self.dmap[i] = mapfile['map'][str(i)][:]
+
+        # Sample from posterior
+        psamp = None
+        loglr_samp = None
+        weight2 = None
+        bin_ids = None
+
+        # Start each surviving bin with weight proportional to its size;
+        # the weights are refined after each round using the measured
+        # likelihoods
+        weight = lengths[passed] / lengths[passed].sum()
+
+        for i in range(self.rounds):
+            try:
+                psamp_v, loglr_samp_v, weight2_v, bin_id = \
+                    self.sample_round(weight / weight.sum(),
+                                      passed, lengths[passed])
+            except OutOfSamples:
+                logging.info("No more samples to draw from")
+                break
+
+            for j, v in zip(bin_id, weight2_v):
+                weight[j] += v
+
+            if psamp is None:
+                psamp = psamp_v
+                loglr_samp = loglr_samp_v
+                weight2 = weight2_v
+                bin_ids = bin_id
+            else:
+                psamp = numpy.concatenate([psamp_v, psamp])
+                loglr_samp = numpy.concatenate([loglr_samp_v, loglr_samp])
+                weight2 = numpy.concatenate([weight2_v, weight2])
+                bin_ids = numpy.concatenate([bin_id, bin_ids])
+
+            ess = 1.0 / ((weight2 / weight2.sum()) ** 2.0).sum()
+            logging.info("ESS = %s", ess)
+
+        # Prepare the equally weighted output samples
+        self.meta['ncalls'] = len(weight2)
+        self.meta['ess'] = ess
+
+        weight2 /= weight2.sum()
+        draw2 = numpy.random.choice(len(psamp), size=int(ess),
+                                    replace=True, p=weight2)
+        logging.info("Unique values second draw %s",
+                     len(numpy.unique(psamp[draw2])))
+
+        fsamp = FieldArray(len(draw2), dtype=self.dtype)
+        for i, v in enumerate(draw2):
+            fsamp[i] = psamp[v]
+
+        self._samples = {p: fsamp[p] for p in self.model.variable_params}
+        self._samples['loglikelihood'] = loglr_samp[draw2]
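+
+
+# A plausible pycbc_inference configuration selecting this sampler might
+# look like the following.  The option names assume the kwargs-from-config
+# forwarding inherited from DummySampler, and the file name and values are
+# purely illustrative:
+#
+#     [sampler]
+#     name = games
+#     mapfile = parameter_space_map.hdf
+#     loglr_region = 25
+#     target_likelihood_calls = 100000
+#     rounds = 2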