Add new extremely fast direct Monte Carlo method (#4678)
* do this

* fix

* metadata

* add lambda check

* revert change

* renaming

* remove unused options

* cc

* cc

* cc

* more cc
ahnitz authored Oct 7, 2024
1 parent c82b88b commit b58e02c
Showing 4 changed files with 246 additions and 2 deletions.
5 changes: 3 additions & 2 deletions pycbc/conversions.py
@@ -431,8 +431,9 @@ def lambda_tilde(mass1, mass2, lambda1, lambda2):
    mask = m1 < m2
    ldiff[mask] = -ldiff[mask]
    eta = eta_from_mass1_mass2(m1, m2)
-    p1 = lsum * (1 + 7. * eta - 31 * eta ** 2.0)
-    p2 = (1 - 4 * eta)**0.5 * (1 + 9 * eta - 11 * eta ** 2.0) * ldiff
+    eta[eta > 0.25] = 0.25  # Account for numerical error, 0.25 is the max
+    p1 = (lsum) * (1 + 7. * eta - 31 * eta ** 2.0)
+    p2 = (1 - 4 * eta)**0.5 * (1 + 9 * eta - 11 * eta ** 2.0) * (ldiff)
    return formatreturn(8.0 / 13.0 * (p1 + p2), input_is_array)

def delta_lambda_tilde(mass1, mass2, lambda1, lambda2):
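The clamp added above guards against floating-point round-off in the symmetric mass ratio: mathematically eta never exceeds 0.25 (the equal-mass case), but numerical error can push it slightly higher, driving the radicand of (1 - 4 * eta)**0.5 negative and producing NaN. A minimal sketch of the failure mode, with an artificially perturbed eta for illustration (not part of the patch):

import numpy

# Mathematically eta <= 0.25; emulate round-off nudging it just above.
eta = numpy.float64(0.25) + numpy.finfo(numpy.float64).eps
print((1 - 4 * eta) ** 0.5)      # nan: the radicand went negative
eta = numpy.minimum(eta, 0.25)   # the clamp added in this commit
print((1 - 4 * eta) ** 0.5)      # 0.0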
2 changes: 2 additions & 0 deletions pycbc/inference/sampler/__init__.py
@@ -26,6 +26,7 @@
from .dummy import DummySampler
from .refine import RefineSampler
from .snowline import SnowlineSampler
+from .games import GameSampler

# list of available samplers
samplers = {cls.name: cls for cls in (
@@ -34,6 +35,7 @@
    DummySampler,
    RefineSampler,
    SnowlineSampler,
+    GameSampler,
)}

try:
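Registering the class in the samplers dictionary makes it selectable by its class-level name attribute, for instance (a small sketch, not part of the patch):

from pycbc.inference.sampler import samplers

cls = samplers['games']    # resolves to GameSampler via name = 'games'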
3 changes: 3 additions & 0 deletions pycbc/inference/sampler/dummy.py
@@ -37,6 +37,7 @@ def __init__(self, model, *args, nprocesses=1, use_mpi=False,
        self.num_samples = int(num_samples)
        self.pool = choose_pool(mpi=use_mpi, processes=nprocesses)
        self._samples = {}
+        self.meta = {}

    @classmethod
    def from_config(cls, cp, model, output_file=None, nprocesses=1,
@@ -70,6 +71,8 @@ def run(self):
    def finalize(self):
        with self.io(self.checkpoint_file, "a") as fp:
            fp.write_samples(samples=self._samples)
+            for k in self.meta:
+                fp[fp.sampler_group].attrs[k] = self.meta[k]

    checkpoint = resume_from_checkpoint = run

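finalize() now persists each entry of self.meta as an HDF5 attribute on the sampler group of the output file. A minimal sketch of reading those attributes back with plain h5py; the file name and the 'sampler_info' group path are assumptions here, not taken from the patch:

import h5py

with h5py.File('posterior.hdf', 'r') as fp:   # hypothetical output file
    grp = fp['sampler_info']                  # assumed sampler group path
    print(dict(grp.attrs))                    # e.g. {'ess': ..., 'ncalls': ...}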
238 changes: 238 additions & 0 deletions pycbc/inference/sampler/games.py
@@ -0,0 +1,238 @@
""" Direct monte carlo sampling using pregenerated mapping files that
encode the intrinsic parameter space.
"""
import logging
import tqdm
import h5py
import numpy
import numpy.random
from scipy.special import logsumexp
from pycbc.io import FieldArray

from pycbc.inference import models
from pycbc.pool import choose_pool
from .dummy import DummySampler


def call_likelihood(params):
""" Accessor to update the global model
"""
models._global_instance.update(**params)
return models._global_instance.loglikelihood


class OutOfSamples(Exception):
    """ Raised when a bin of the prior volume has no samples left to draw """


class GameSampler(DummySampler):
    """Direct importance sampling using a preconstructed parameter space
    mapping file.

    Parameters
    ----------
    model : Model
        An instance of a model from ``pycbc.inference.models``.
    mapfile : str
        Path to the pre-generated file containing the pre-mapped prior
        volume.
    loglr_region : float
        Only use regions from the prior volume tiling that are within
        this loglr difference of the maximum tile.
    target_likelihood_calls : int
        Try to use this many likelihood calls in each round of the
        analysis.
    rounds : int
        The number of iterations to run before terminating.
    """
    name = 'games'

    def __init__(self, model, *args, nprocesses=1, use_mpi=False,
                 mapfile=None,
                 loglr_region=25,
                 target_likelihood_calls=1e5,
                 rounds=1,
                 **kwargs):
        super().__init__(model, *args)

        self.meta = {}
        self.mapfile = mapfile
        self.rounds = int(rounds)
        self.dmap = {}
        self.draw = {}

        models._global_instance = model
        self.model = model
        self.pool = choose_pool(mpi=use_mpi, processes=nprocesses)
        self._samples = {}

        self.target_likelihood_calls = int(target_likelihood_calls)
        self.loglr_region = float(loglr_region)

    def draw_samples_from_bin(self, i, size):
        """ Get samples from the binned prior space """
        if i not in self.draw:
            self.draw[i] = numpy.arange(0, len(self.dmap[i]))

        if size > len(self.draw[i]):
            raise OutOfSamples

        numpy.random.shuffle(self.draw[i])
        selected = self.draw[i][:size]
        self.draw[i] = self.draw[i][size:]

        if size > 0:
            remain = len(self.draw[i])
            logging.info('Drew %i, %i remains in bin %i', size, remain, i)

        return self.dmap[i][selected]

    def sample_round(self, bin_weight, node_idx, lengths):
        """ Sample from the posterior using pre-binned sets of points and
        the weighting factor of each bin.

        Parameters
        ----------
        bin_weight : Array
            The weighting importance factor of each bin of the prior space.
        node_idx : Array
            The set of ids into the prebinned prior volume to use. This
            should map to the given weights.
        lengths : Array
            The size of each bin, used to self-normalize.
        """
        logging.info("...draw template bins")
        drawcount = (bin_weight * self.target_likelihood_calls).astype(int)

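        # Redistribute the requested draws in descending-weight order:
        # a bin cannot supply more points than it holds, so cap it and
        # hand the excess to the under-subscribed bins that follow.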
        dorder = bin_weight.argsort()[::-1]
        remainder = 0
        for i in dorder:
            bincount = drawcount[i]
            binlen = lengths[i]
            if bincount > binlen:
                drawcount[i] = binlen
                remainder += bincount - binlen
            elif bincount < binlen:
                asize = min(binlen - bincount, remainder)
                drawcount[i] += asize
                remainder -= asize

        drawweight = bin_weight / drawcount
        total_draw = drawcount.sum()

        logging.info('...drawn random points within bins')
        psamp = FieldArray(total_draw, dtype=self.dtype)
        pweight = numpy.zeros(total_draw, dtype=float)
        bin_id = numpy.zeros(total_draw, dtype=int)
        j = 0
        for i, (c, w) in enumerate(zip(drawcount, drawweight)):
            bdraw = self.draw_samples_from_bin(node_idx[i], c)
            psamp[j:j+len(bdraw)] = FieldArray.from_records(bdraw,
                                                            dtype=self.dtype)
            pweight[j:j+len(bdraw)] = numpy.log(bin_weight[i]) - numpy.log(w)
            bin_id[j:j+len(bdraw)] = i
            j += len(bdraw)

logging.info("Possible unique values %s", lengths.sum())
logging.info("Templates drawn from %s", len(lengths))
logging.info("Unique values first draw %s", len(psamp))

# Calculate the likelihood values for the unique parameter space
# points
args = []
for i in range(len(psamp)):
pset = {p: psamp[p][i] for p in self.model.variable_params}
args.append(pset)

loglr_samp = list(tqdm.tqdm(self.pool.imap(call_likelihood, args),
total=len(args)))
loglr_samp = numpy.array(loglr_samp)

# Calculate the weights from the actual likelihood relative to the
# initial weights
logw3 = loglr_samp + numpy.log(lengths[bin_id]) - pweight
logw3 -= logsumexp(logw3)
weight2 = numpy.exp(logw3)
return psamp, loglr_samp, weight2, bin_id

    def run(self):
        """ Produce posterior samples """
        logging.info('Retrieving params of parameter space nodes')
        with h5py.File(self.mapfile, 'r') as mapfile:
            bparams = {p: mapfile['bank'][p][:] for p in self.variable_params}
            num_nodes = len(bparams[list(bparams.keys())[0]])
            lengths = numpy.array([len(mapfile['map'][str(x)])
                                   for x in range(num_nodes)])
            self.dtype = mapfile['map']['0'].dtype

        logging.info('Calculating likelihood at nodes')
        args = []
        for i in range(num_nodes):
            pset = {p: bparams[p][i] for p in self.model.variable_params}
            args.append(pset)

        node_loglrs = list(tqdm.tqdm(self.pool.imap(call_likelihood, args),
                                     total=len(args)))
        node_loglrs = numpy.array(node_loglrs)
        loglr_bound = node_loglrs[~numpy.isnan(node_loglrs)].max()
        loglr_bound -= self.loglr_region

        logging.info('Drawing proposal samples from node regions')
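        # Keep only nodes within loglr_region of the best node, weighting
        # each survivor by its likelihood times the size of its bin.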
        logw = node_loglrs + numpy.log(lengths)
        passed = (node_loglrs > loglr_bound) & ~numpy.isnan(node_loglrs)
        passed = numpy.where(passed)[0]
        logw2 = logw[passed]
        logw2 -= logsumexp(logw2)
        weight = numpy.exp(logw2)

        logging.info("...reading template bins")
        with h5py.File(self.mapfile, 'r') as mapfile:
            for i in passed:
                self.dmap[i] = mapfile['map'][str(i)][:]

        # Sample from posterior
        psamp = None
        loglr_samp = None
        weight2 = None
        bin_ids = None

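        # Initial proposal weights are proportional to bin size; each
        # round's importance weights are accumulated into them below.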
        weight = lengths[passed] / lengths[passed].sum()

        for i in range(self.rounds):
            try:
                psamp_v, loglr_samp_v, weight2_v, bin_id = \
                    self.sample_round(weight / weight.sum(),
                                      passed, lengths[passed])
            except OutOfSamples:
                logging.info("No more samples to draw from")
                break

            for j, v in zip(bin_id, weight2_v):
                weight[j] += v

            if psamp is None:
                psamp = psamp_v
                loglr_samp = loglr_samp_v
                weight2 = weight2_v
                bin_ids = bin_id
            else:
                psamp = numpy.concatenate([psamp_v, psamp])
                loglr_samp = numpy.concatenate([loglr_samp_v, loglr_samp])
                weight2 = numpy.concatenate([weight2_v, weight2])
                bin_ids = numpy.concatenate([bin_id, bin_ids])

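        # Effective sample size of the normalized importance weights,
        # ess = 1 / sum(w_i ** 2); this sets the output sample count.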
        ess = 1.0 / ((weight2 / weight2.sum()) ** 2.0).sum()
        logging.info("ESS = %s", ess)

        # Prepare the equally weighted output samples
        self.meta['ncalls'] = len(weight2)
        self.meta['ess'] = ess

        weight2 /= weight2.sum()
        draw2 = numpy.random.choice(len(psamp), size=int(ess * 1),
                                    replace=True, p=weight2)
        logging.info("Unique values second draw %s",
                     len(numpy.unique(psamp[draw2])))

        fsamp = FieldArray(len(draw2), dtype=self.dtype)
        for i, v in enumerate(draw2):
            fsamp[i] = psamp[v]

        self._samples = {p: fsamp[p] for p in self.model.variable_params}
        self._samples['loglikelihood'] = loglr_samp[draw2]

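Taken together, the new sampler can be driven directly from Python. A minimal usage sketch, where model stands for an already-configured pycbc.inference model instance and map.hdf is a hypothetical pre-generated mapping file; the keyword names come straight from the constructor above:

from pycbc.inference.sampler.games import GameSampler

sampler = GameSampler(model,                 # assumed: a configured model
                      mapfile='map.hdf',     # hypothetical pre-mapped prior volume
                      loglr_region=25,
                      target_likelihood_calls=1e5,
                      rounds=1,
                      nprocesses=4)
sampler.run()
posterior = sampler._samples    # equally weighted samples plus 'loglikelihood'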