PyGRB: Update postprocessing functions (gwastro#4550)
* Add mapping function

* Implement mapping in trig combiner

* Fix "template_id" dataset being written too early

* Force template_id to be integer

* Try adding bank files to workflow

* Remove fixme

* Only use one bank_file

* Add template mapping to inj finder

* Small change

* mapping function works with trig file object

* Small change

* Remove typo

* Add bank file opt to inj_finder

* Add template masses to multi_inspiral output

* sort_trigs updates

* Extract trig properties

* Add old imports back for CodeClimate

* Remove unused bestnr opts

* Update pycbc/results/pygrb_postprocessing_utils.py

Co-authored-by: Francesco Pannarale <pannarale@users.noreply.github.com>

* Codeclimate

* Remove masses from multi inspiral output

* Correct segment loading names

* Add NoneType handling to _slide_vetoes function

* Indentation fix

* Add 's' back in

* Fix docstring(?)

* Codeclimate

* Codeclimate

* Update pycbc/results/pygrb_postprocessing_utils.py

Co-authored-by: Tito Dal Canton <tito@dalcanton.it>

* Uses event_ids in sort_trigs to avoid confusion

* Add possibility of multiple banks (and NotImplementedError)

* Remove enumerate and fix indexing issue

* Check for single bank earlier

* Simplify column name check

* Use zip()

* Update pycbc/results/pygrb_postprocessing_utils.py

Co-authored-by: Tito Dal Canton <tito@dalcanton.it>

* Update pycbc/results/pygrb_postprocessing_utils.py

Co-authored-by: Tito Dal Canton <tito@dalcanton.it>

---------

Co-authored-by: Francesco Pannarale <pannarale@users.noreply.github.com>
Co-authored-by: Tito Dal Canton <tito@dalcanton.it>
3 people authored and bhooshan-gadre committed Mar 4, 2024
1 parent 5c7feac commit 73741ac
Showing 5 changed files with 103 additions and 55 deletions.
21 changes: 19 additions & 2 deletions bin/pygrb/pycbc_grb_inj_finder
@@ -42,6 +42,7 @@ from ligo.segments.utils import fromsegwizard
from pycbc import __version__
from pycbc.inject import InjectionSet
from pycbc.io import FieldArray
from pycbc.results.pygrb_postprocessing_utils import template_hash_to_id

__author__ = "Duncan Macleod <duncan.macleod@ligo.org>"

@@ -96,6 +97,8 @@ def read_hdf5_triggers(inputfiles, verbose=False):
elif name.startswith("network") and \
NETWORK_IFO_EVENT_ID_REGEX.match(name):
return
elif "template_id" in name:
return
else:
shape = obj.shape
dtype = obj.dtype
@@ -126,7 +129,7 @@ def read_hdf5_triggers(inputfiles, verbose=False):
# copy dataset contents
for filename in inputfiles:
with h5py.File(filename, 'r') as h5in:
# Skipping emtpy files
# Skipping empty files
if len(h5in['network']['end_time_gc'][:]):
for dset in datasets:
data = h5in[dset][:]
@@ -174,6 +177,11 @@ parser.add_argument(
nargs="+",
help="path(s) to input injection file(s)",
)
parser.add_argument(
"--bank-file",
required=True,
help="full template bank file for mapping hashes to ids",
)
parser.add_argument(
"-o",
"--output-dir",
@@ -329,12 +337,21 @@ with h5py.File(outfilename, "w") as h5out:
for x in out_trigs:
xgroup = h5out.create_group(x)
for col in out_trigs[x]:
if "template_id" == col:
continue
xgroup.create_dataset(
col,
data=out_trigs[x][col],
compression="gzip",
compression_opts=9,
)

# map and write template ids
ids = template_hash_to_id(h5out, args.bank_file)
h5out.create_dataset(
"network/template_id",
data=ids,
compression="gzip",
compression_opts=9,
)
if args.verbose:
print("Found/missed written to {}".format(outfilename))
19 changes: 19 additions & 0 deletions bin/pygrb/pycbc_grb_trig_combiner
@@ -36,6 +36,7 @@ from ligo import segments
from ligo.segments.utils import fromsegwizard

from pycbc import __version__
from pycbc.results.pygrb_postprocessing_utils import template_hash_to_id

__author__ = "Duncan Macleod <duncan.macleod@ligo.org>"

@@ -55,6 +56,8 @@ def _init_combined_file(h5file, attributes, datasets, **compression_kw):
h5file.attrs.update(attributes)
# create datasets
for dset, (shape, dtype) in datasets.items():
if "template_id" in dset:
continue
h5file.create_dataset(dset, shape=shape, dtype=dtype,
**compression_kw)

@@ -207,6 +210,11 @@ def merge_hdf5_files(inputfiles, outputfile, verbose=False, **compression_kw):
else:
ifo_event_id_pos = None

# Template ID becomes unhelpful in the merged file.
# This will be handled separately.
if "template_id" in dset:
continue

# merge dataset
position[dset] += _merge_dataset(
h5in[dset], h5out[dset], position[dset],
@@ -218,6 +226,12 @@ def merge_hdf5_files(inputfiles, outputfile, verbose=False, **compression_kw):
while search_datasets:
dataset_names.remove(search_datasets.pop())

template_ids = template_hash_to_id(trigger_file=h5out, bank_path=args.bank_file)
h5out.create_dataset("network/template_id",
data=template_ids,
compression="gzip",
compression_opts=9)

if verbose:
print("Merged triggers written to {}".format(outputfile))
return outputfile
Expand Down Expand Up @@ -423,6 +437,11 @@ parser.add_argument(
metavar="TRIGGER FILE",
help="read in listed trigger files",
)
parser.add_argument(
"--bank-file",
required=True,
help="full template bank file for mapping hashes to ids",
)
parser.add_argument(
"-o",
"--output-dir",
9 changes: 7 additions & 2 deletions bin/pygrb/pycbc_make_offline_grb_workflow
@@ -273,9 +273,14 @@ all_files.extend(datafind_veto_files)
# TEMPLATE BANK AND SPLIT BANK
bank_files = _workflow.setup_tmpltbank_workflow(wflow, sciSegs,
datafind_files, df_dir)
# Check there are not multiple banks
if len(bank_files) > 1:
raise NotImplementedError("Multiple banks not supported")
full_bank_file = bank_files[0]
# Note: setup_splittable_workflow requires a FileList as input
splitbank_files = _workflow.setup_splittable_workflow(wflow, bank_files, df_dir,
tags=["inspiral"])
all_files.extend(bank_files)
all_files.append(full_bank_file)
all_files.extend(splitbank_files)

# INJECTIONS
@@ -411,7 +416,7 @@ if post_proc_method == "PYGRB_OFFLINE":
# The content and structure of pp_files are described in the definition
# of _workflow.setup_pygrb_pp_workflow
pp_files = _workflow.setup_pygrb_pp_workflow(wflow, pp_dir, seg_dir,
sciSegs[ifo][0], inspiral_files,
sciSegs[ifo][0], full_bank_file, inspiral_files,
injs, inj_insp_files, inj_tags)
sec_name = 'workflow-pygrb_pp_workflow'
if not wflow.cp.has_section(sec_name):
95 changes: 50 additions & 45 deletions pycbc/results/pygrb_postprocessing_utils.py
@@ -310,16 +310,17 @@ def _get_id_numbers(ligolw_table, column):
# =============================================================================
# Function to build a dictionary (indexed by ifo) of time-slid vetoes
# =============================================================================
def _slide_vetoes(vetoes, slide_dict_or_list, slide_id):
def _slide_vetoes(vetoes, slide_dict_or_list, slide_id, ifos):
"""Build a dictionary (indexed by ifo) of time-slid vetoes"""

# Copy vetoes
slid_vetoes = copy.deepcopy(vetoes)

# Slide them
ifos = vetoes.keys()
for ifo in ifos:
slid_vetoes[ifo].shift(-slide_dict_or_list[slide_id][ifo])
if vetoes is not None:
slid_vetoes = copy.deepcopy(vetoes)
# Slide them
for ifo in ifos:
slid_vetoes[ifo].shift(-slide_dict_or_list[slide_id][ifo])
else:
slid_vetoes = {ifo: segments.segmentlist() for ifo in ifos}

return slid_vetoes

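As a small illustration of the NoneType handling added above (detector names and slide values are invented for this sketch):

from pycbc.results.pygrb_postprocessing_utils import _slide_vetoes

# One trivial time slide with zero and non-zero shifts
slide_dict = {0: {"H1": 0.0, "L1": 10.0}}
# With no veto segments available, each ifo simply gets an empty segmentlist
slid_vetoes = _slide_vetoes(None, slide_dict, 0, ["H1", "L1"])
print(slid_vetoes)  # {'H1': [], 'L1': []}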
@@ -461,84 +462,67 @@ def sort_trigs(trial_dict, trigs, slide_dict, seg_dict):
sorted_trigs = {}

# Begin by sorting the triggers into each slide
# New seems pretty slow, so run it once and then use deepcopy
tmp_table = glsctables.New(glsctables.MultiInspiralTable)
for slide_id in slide_dict:
sorted_trigs[slide_id] = copy.deepcopy(tmp_table)
for trig in trigs:
sorted_trigs[int(trig.time_slide_id)].append(trig)
sorted_trigs[slide_id] = []
for slide_id, event_id in zip(trigs['network/slide_id'],
trigs['network/event_id']):
sorted_trigs[slide_id].append(event_id)

for slide_id in slide_dict:
# These can only *reduce* the analysis time
curr_seg_list = seg_dict[slide_id]

# Check the triggers are all in the analysed segment lists
for trig in sorted_trigs[slide_id]:
if trig.end_time not in curr_seg_list:
for event_id in sorted_trigs[slide_id]:
index = numpy.flatnonzero(trigs['network/event_id'] == event_id)[0]
end_time = trigs['network/end_time_gc'][index]
if end_time not in curr_seg_list:
# This can be raised if the trigger is on the segment boundary,
# so check if the trigger is within 1/100 of a second within
# the list
if trig.get_end() + 0.01 in curr_seg_list:
if end_time + 0.01 in curr_seg_list:
continue
if trig.get_end() - 0.01 in curr_seg_list:
if end_time - 0.01 in curr_seg_list:
continue
err_msg = "Triggers found in input files not in the list of "
err_msg += "analysed segments. This should not happen."
raise RuntimeError(err_msg)
# END OF CHECK #

# The below line works like the inverse of .veto and only returns trigs
# that are within the segment specified by trial_dict[slide_id]
sorted_trigs[slide_id] = \
sorted_trigs[slide_id].vetoed(trial_dict[slide_id])
# Keep triggers that are in trial_dict
sorted_trigs[slide_id] = [event_id for event_id in
sorted_trigs[slide_id]
if trigs['network/end_time_gc'][
trigs['network/event_id'] == event_id][0]
in trial_dict[slide_id]]

return sorted_trigs


# =============================================================================
# Extract basic trigger properties and store them as dictionaries
# =============================================================================
def extract_basic_trig_properties(trial_dict, trigs, slide_dict, seg_dict,
opts):
def extract_basic_trig_properties(trial_dict, trigs, slide_dict, seg_dict):
"""Extract and store as dictionaries time, SNR, and BestNR of
time-slid triggers"""

# Sort the triggers into each slide
sorted_trigs = sort_trigs(trial_dict, trigs, slide_dict, seg_dict)
logging.info("Triggers sorted.")

# Local copies of variables entering the BestNR definition
chisq_index = opts.chisq_index
chisq_nhigh = opts.chisq_nhigh
null_thresh = list(map(float, opts.null_snr_threshold.split(',')))
snr_thresh = opts.snr_threshold
sngl_snr_thresh = opts.sngl_snr_threshold
new_snr_thresh = opts.newsnr_threshold
null_grad_thresh = opts.null_grad_thresh
null_grad_val = opts.null_grad_val

# Build the 3 dictionaries
trig_time = {}
trig_snr = {}
trig_bestnr = {}
for slide_id in slide_dict:
slide_trigs = sorted_trigs[slide_id]
if slide_trigs:
trig_time[slide_id] = numpy.asarray(slide_trigs.get_end()).\
astype(float)
trig_snr[slide_id] = numpy.asarray(slide_trigs.get_column('snr'))
trig_time[slide_id] = trigs['network/end_time_gc'][slide_trigs]
trig_snr[slide_id] = trigs['network/coherent_snr'][slide_trigs]
else:
trig_time[slide_id] = numpy.asarray([])
trig_snr[slide_id] = numpy.asarray([])
trig_bestnr[slide_id] = get_bestnrs(slide_trigs,
q=chisq_index,
n=chisq_nhigh,
null_thresh=null_thresh,
snr_threshold=snr_thresh,
sngl_snr_threshold=sngl_snr_thresh,
chisq_threshold=new_snr_thresh,
null_grad_thresh=null_grad_thresh,
null_grad_val=null_grad_val)
trig_bestnr[slide_id] = trigs['network/reweighted_snr'][slide_trigs]
logging.info("Time, SNR, and BestNR of triggers extracted.")

return trig_time, trig_snr, trig_bestnr
@@ -699,7 +683,7 @@ def construct_trials(seg_files, seg_dict, ifos, slide_dict, vetoes):
seg_buffer.coalesce()

        # Construct the ifo-indexed dictionary of slid vetoes
slid_vetoes = _slide_vetoes(vetoes, slide_dict, slide_id)
slid_vetoes = _slide_vetoes(vetoes, slide_dict, slide_id, ifos)

# Construct trial list and check against buffer
trial_dict[slide_id] = segments.segmentlist()
@@ -804,3 +788,24 @@ def get_coinc_snr(trigs_or_injs, ifos):
coinc_snr = numpy.sqrt(snr_sum_square)

return coinc_snr


def template_hash_to_id(trigger_file, bank_path):
"""
This function converts the template hashes from a trigger file
into 'template_id's that represent indices of the
templates within the bank.
Parameters
----------
trigger_file: h5py File object for trigger file
bank_file: filepath for template bank
"""
with h5py.File(bank_path, "r") as bank:
hashes = bank['template_hash'][:]
ifos = [k for k in trigger_file.keys() if k != 'network']
trig_hashes = trigger_file[f'{ifos[0]}/template_hash'][:]
trig_ids = numpy.zeros(trig_hashes.shape[0], dtype=int)
for idx, t_hash in enumerate(hashes):
matches = numpy.where(trig_hashes == t_hash)
trig_ids[matches] = idx
return trig_ids
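A brief usage sketch of the new helper; the paths are placeholders, and the trigger file is assumed to contain per-detector template_hash datasets as in the files produced above:

import h5py
from pycbc.results.pygrb_postprocessing_utils import template_hash_to_id

# Map per-detector template hashes in a merged trigger file onto row
# indices of the full bank and store them alongside the triggers.
with h5py.File("GRB_MERGED_TRIGGERS.h5", "a") as trig_file:
    ids = template_hash_to_id(trig_file, "H1L1V1-BANK.h5")
    trig_file.create_dataset("network/template_id", data=ids)

Note that the lookup loops over every bank hash; for very large banks a sorted-hash search (e.g. numpy.searchsorted) would give the same mapping more quickly, but the simple loop keeps the code easy to read.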
14 changes: 8 additions & 6 deletions pycbc/workflow/grb_utils.py
@@ -233,8 +233,8 @@ def get_sky_grid_scale(
return out


def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
inj_files, inj_insp_files, inj_tags):
def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, bank_file,
insp_files, inj_files, inj_insp_files, inj_tags):
"""
Generate post-processing section of PyGRB offline workflow
"""
@@ -258,7 +258,7 @@ def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
job_instance = exe_class(wf.cp, "trig_combiner")
# Create node for coherent no injections jobs
node, trig_files = job_instance.create_node(wf.ifos, seg_dir, segment,
insp_files, pp_dir)
insp_files, pp_dir, bank_file)
wf.add_node(node)
pp_outs.append(trig_files)

@@ -285,7 +285,7 @@ def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
if inj_tag in f.tags[1]])
node, inj_find_file = job_instance.create_node(
tag_inj_files, tag_insp_files,
pp_dir)
bank_file, pp_dir)
wf.add_node(node)
inj_find_files.append(inj_find_file)
pp_outs.append(inj_find_files)
@@ -321,7 +321,7 @@ def __init__(self, cp, name):
self.num_trials = int(cp.get('trig_combiner', 'num-trials'))

def create_node(self, ifo_tag, seg_dir, segment, insp_files,
out_dir, tags=None):
out_dir, bank_file, tags=None):
node = Node(self)
node.add_opt('--verbose')
node.add_opt("--ifo-tag", ifo_tag)
@@ -331,6 +331,7 @@ def create_node(self, ifo_tag, seg_dir, segment, insp_files,
node.add_input_list_opt("--input-files", insp_files)
node.add_opt("--user-tag", "PYGRB")
node.add_opt("--num-trials", self.num_trials)
node.add_input_opt("--bank-file", bank_file)
# Prepare output file tag
user_tag = f"PYGRB_GRB{self.trigger_name}"
if tags:
@@ -385,13 +386,14 @@ class PycbcGrbInjFinderExecutable(Executable):
def __init__(self, cp, exe_name):
super().__init__(cp=cp, name=exe_name)

def create_node(self, inj_files, inj_insp_files,
def create_node(self, inj_files, inj_insp_files, bank_file,
out_dir, tags=None):
if tags is None:
tags = []
node = Node(self)
node.add_input_list_opt('--input-files', inj_insp_files)
node.add_input_list_opt('--inj-files', inj_files)
node.add_input_opt('--bank-file', bank_file)
ifo_tag, desc, segment = filename_metadata(inj_files[0].name)
desc = '_'.join(desc.split('_')[:-1])
out_name = "{}-{}_FOUNDMISSED-{}-{}.h5".format(
