PyGRB: Update postprocessing functions (gwastro#4550)
* Add mapping function

* Implement mapping in trig combiner

* Fix "template_id" dataset being written too early

* Force template_id to be integer

* Try adding bank files to workflow

* Remove fixme

* Only use one bank_file

* Add template mapping to inj finder

* Small change

* mapping function works with trig file object

* Small change

* Remove typo

* Add bank file opt to inj_finder

* Add template masses to multi_inspiral output

* sort_trigs updates

* Extract trig properties

* Add old imports back for CodeClimate

* Remove unused bestnr opts

* Update pycbc/results/pygrb_postprocessing_utils.py

Co-authored-by: Francesco Pannarale <pannarale@users.noreply.github.com>

* Codeclimate

* Remove masses from multi inspiral output

* Correct segment loading names

* Add NoneType handling to _slide_vetoes function

* Indentation fix

* Add 's' back in

* Fix docstring(?)

* Codeclimate

* Codeclimate

* Update pycbc/results/pygrb_postprocessing_utils.py

Co-authored-by: Tito Dal Canton <tito@dalcanton.it>

* Uses event_ids in sort_trigs to avoid confusion

* Add possibility of multiple banks (and NotImplementedError)

* Remove enumerate and fix indexing issue

* Check for single bank earlier

* Simplify column name check

* Use zip()

* Update pycbc/results/pygrb_postprocessing_utils.py

Co-authored-by: Tito Dal Canton <tito@dalcanton.it>

* Update pycbc/results/pygrb_postprocessing_utils.py

Co-authored-by: Tito Dal Canton <tito@dalcanton.it>

---------

Co-authored-by: Francesco Pannarale <pannarale@users.noreply.github.com>
Co-authored-by: Tito Dal Canton <tito@dalcanton.it>
3 people authored and bhooshan-gadre committed Mar 4, 2024
1 parent 5c7feac commit 73741ac
Showing 5 changed files with 103 additions and 55 deletions.
21 changes: 19 additions & 2 deletions bin/pygrb/pycbc_grb_inj_finder
@@ -42,6 +42,7 @@ from ligo.segments.utils import fromsegwizard
from pycbc import __version__
from pycbc.inject import InjectionSet
from pycbc.io import FieldArray
from pycbc.results.pygrb_postprocessing_utils import template_hash_to_id

__author__ = "Duncan Macleod <duncan.macleod@ligo.org>"

@@ -96,6 +97,8 @@ def read_hdf5_triggers(inputfiles, verbose=False):
elif name.startswith("network") and \
NETWORK_IFO_EVENT_ID_REGEX.match(name):
return
elif "template_id" in name:
return
else:
shape = obj.shape
dtype = obj.dtype
@@ -126,7 +129,7 @@ def read_hdf5_triggers(inputfiles, verbose=False):
# copy dataset contents
for filename in inputfiles:
with h5py.File(filename, 'r') as h5in:
# Skipping emtpy files
# Skipping empty files
if len(h5in['network']['end_time_gc'][:]):
for dset in datasets:
data = h5in[dset][:]
@@ -174,6 +177,11 @@ parser.add_argument(
nargs="+",
help="path(s) to input injection file(s)",
)
parser.add_argument(
"--bank-file",
required=True,
help="full template bank file for mapping hashes to ids",
)
parser.add_argument(
"-o",
"--output-dir",
@@ -329,12 +337,21 @@ with h5py.File(outfilename, "w") as h5out:
for x in out_trigs:
xgroup = h5out.create_group(x)
for col in out_trigs[x]:
if "template_id" == col:
continue
xgroup.create_dataset(
col,
data=out_trigs[x][col],
compression="gzip",
compression_opts=9,
)

# map and write template ids
ids = template_hash_to_id(h5out, args.bank_file)
h5out.create_dataset(
"network/template_id",
data=ids,
compression="gzip",
compression_opts=9,
)
if args.verbose:
print("Found/missed written to {}".format(outfilename))
19 changes: 19 additions & 0 deletions bin/pygrb/pycbc_grb_trig_combiner
@@ -36,6 +36,7 @@ from ligo import segments
from ligo.segments.utils import fromsegwizard

from pycbc import __version__
from pycbc.results.pygrb_postprocessing_utils import template_hash_to_id

__author__ = "Duncan Macleod <duncan.macleod@ligo.org>"

@@ -55,6 +56,8 @@ def _init_combined_file(h5file, attributes, datasets, **compression_kw):
h5file.attrs.update(attributes)
# create datasets
for dset, (shape, dtype) in datasets.items():
if "template_id" in dset:
continue
h5file.create_dataset(dset, shape=shape, dtype=dtype,
**compression_kw)

@@ -207,6 +210,11 @@ def merge_hdf5_files(inputfiles, outputfile, verbose=False, **compression_kw):
else:
ifo_event_id_pos = None

# Template ID becomes unhelpful in the merged file.
# This will be handled separately.
if "template_id" in dset:
continue

# merge dataset
position[dset] += _merge_dataset(
h5in[dset], h5out[dset], position[dset],
@@ -218,6 +226,12 @@ def merge_hdf5_files(inputfiles, outputfile, verbose=False, **compression_kw):
while search_datasets:
dataset_names.remove(search_datasets.pop())

template_ids = template_hash_to_id(trigger_file=h5out, bank_path=args.bank_file)
h5out.create_dataset("network/template_id",
data=template_ids,
compression="gzip",
compression_opts=9)

if verbose:
print("Merged triggers written to {}".format(outputfile))
return outputfile
Expand Down Expand Up @@ -423,6 +437,11 @@ parser.add_argument(
metavar="TRIGGER FILE",
help="read in listed trigger files",
)
parser.add_argument(
"--bank-file",
required=True,
help="full template bank file for mapping hashes to ids",
)
parser.add_argument(
"-o",
"--output-dir",
9 changes: 7 additions & 2 deletions bin/pygrb/pycbc_make_offline_grb_workflow
@@ -273,9 +273,14 @@ all_files.extend(datafind_veto_files)
# TEMPLATE BANK AND SPLIT BANK
bank_files = _workflow.setup_tmpltbank_workflow(wflow, sciSegs,
datafind_files, df_dir)
# Check there are not multiple banks
if len(bank_files) > 1:
raise NotImplementedError("Multiple banks not supported")
full_bank_file = bank_files[0]
# Note: setup_splittable_workflow requires a FileList as input
splitbank_files = _workflow.setup_splittable_workflow(wflow, bank_files, df_dir,
tags=["inspiral"])
all_files.extend(bank_files)
all_files.append(full_bank_file)
all_files.extend(splitbank_files)

# INJECTIONS
@@ -411,7 +416,7 @@ if post_proc_method == "PYGRB_OFFLINE":
# The content and structure of pp_files are described in the definition
# of _workflow.setup_pygrb_pp_workflow
pp_files = _workflow.setup_pygrb_pp_workflow(wflow, pp_dir, seg_dir,
sciSegs[ifo][0], inspiral_files,
sciSegs[ifo][0], full_bank_file, inspiral_files,
injs, inj_insp_files, inj_tags)
sec_name = 'workflow-pygrb_pp_workflow'
if not wflow.cp.has_section(sec_name):
95 changes: 50 additions & 45 deletions pycbc/results/pygrb_postprocessing_utils.py
@@ -310,16 +310,17 @@ def _get_id_numbers(ligolw_table, column):
# =============================================================================
# Function to build a dictionary (indexed by ifo) of time-slid vetoes
# =============================================================================
def _slide_vetoes(vetoes, slide_dict_or_list, slide_id):
def _slide_vetoes(vetoes, slide_dict_or_list, slide_id, ifos):
"""Build a dictionary (indexed by ifo) of time-slid vetoes"""

# Copy vetoes
slid_vetoes = copy.deepcopy(vetoes)

# Slide them
ifos = vetoes.keys()
for ifo in ifos:
slid_vetoes[ifo].shift(-slide_dict_or_list[slide_id][ifo])
if vetoes is not None:
slid_vetoes = copy.deepcopy(vetoes)
# Slide them
for ifo in ifos:
slid_vetoes[ifo].shift(-slide_dict_or_list[slide_id][ifo])
else:
slid_vetoes = {ifo: segments.segmentlist() for ifo in ifos}

return slid_vetoes

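As a small illustration of the NoneType handling added above (detector names and slide values are invented for this sketch):

from pycbc.results.pygrb_postprocessing_utils import _slide_vetoes

# One trivial time slide with zero and non-zero shifts
slide_dict = {0: {"H1": 0.0, "L1": 10.0}}
# With no veto segments available, each ifo simply gets an empty segmentlist
slid_vetoes = _slide_vetoes(None, slide_dict, 0, ["H1", "L1"])
print(slid_vetoes)  # {'H1': [], 'L1': []}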
@@ -461,84 +462,67 @@ def sort_trigs(trial_dict, trigs, slide_dict, seg_dict):
sorted_trigs = {}

# Begin by sorting the triggers into each slide
# New seems pretty slow, so run it once and then use deepcopy
tmp_table = glsctables.New(glsctables.MultiInspiralTable)
for slide_id in slide_dict:
sorted_trigs[slide_id] = copy.deepcopy(tmp_table)
for trig in trigs:
sorted_trigs[int(trig.time_slide_id)].append(trig)
sorted_trigs[slide_id] = []
for slide_id, event_id in zip(trigs['network/slide_id'],
trigs['network/event_id']):
sorted_trigs[slide_id].append(event_id)

for slide_id in slide_dict:
# These can only *reduce* the analysis time
curr_seg_list = seg_dict[slide_id]

# Check the triggers are all in the analysed segment lists
for trig in sorted_trigs[slide_id]:
if trig.end_time not in curr_seg_list:
for event_id in sorted_trigs[slide_id]:
index = numpy.flatnonzero(trigs['network/event_id'] == event_id)[0]
end_time = trigs['network/end_time_gc'][index]
if end_time not in curr_seg_list:
# This can be raised if the trigger is on the segment boundary,
# so check if the trigger is within 1/100 of a second within
# the list
if trig.get_end() + 0.01 in curr_seg_list:
if end_time + 0.01 in curr_seg_list:
continue
if trig.get_end() - 0.01 in curr_seg_list:
if end_time - 0.01 in curr_seg_list:
continue
err_msg = "Triggers found in input files not in the list of "
err_msg += "analysed segments. This should not happen."
raise RuntimeError(err_msg)
# END OF CHECK #

# The below line works like the inverse of .veto and only returns trigs
# that are within the segment specified by trial_dict[slide_id]
sorted_trigs[slide_id] = \
sorted_trigs[slide_id].vetoed(trial_dict[slide_id])
# Keep triggers that are in trial_dict
sorted_trigs[slide_id] = [event_id for event_id in
sorted_trigs[slide_id]
if trigs['network/end_time_gc'][
trigs['network/event_id'] == event_id][0]
in trial_dict[slide_id]]

return sorted_trigs


# =============================================================================
# Extract basic trigger properties and store them as dictionaries
# =============================================================================
def extract_basic_trig_properties(trial_dict, trigs, slide_dict, seg_dict,
opts):
def extract_basic_trig_properties(trial_dict, trigs, slide_dict, seg_dict):
"""Extract and store as dictionaries time, SNR, and BestNR of
time-slid triggers"""

# Sort the triggers into each slide
sorted_trigs = sort_trigs(trial_dict, trigs, slide_dict, seg_dict)
logging.info("Triggers sorted.")

# Local copies of variables entering the BestNR definition
chisq_index = opts.chisq_index
chisq_nhigh = opts.chisq_nhigh
null_thresh = list(map(float, opts.null_snr_threshold.split(',')))
snr_thresh = opts.snr_threshold
sngl_snr_thresh = opts.sngl_snr_threshold
new_snr_thresh = opts.newsnr_threshold
null_grad_thresh = opts.null_grad_thresh
null_grad_val = opts.null_grad_val

# Build the 3 dictionaries
trig_time = {}
trig_snr = {}
trig_bestnr = {}
for slide_id in slide_dict:
slide_trigs = sorted_trigs[slide_id]
if slide_trigs:
trig_time[slide_id] = numpy.asarray(slide_trigs.get_end()).\
astype(float)
trig_snr[slide_id] = numpy.asarray(slide_trigs.get_column('snr'))
trig_time[slide_id] = trigs['network/end_time_gc'][slide_trigs]
trig_snr[slide_id] = trigs['network/coherent_snr'][slide_trigs]
else:
trig_time[slide_id] = numpy.asarray([])
trig_snr[slide_id] = numpy.asarray([])
trig_bestnr[slide_id] = get_bestnrs(slide_trigs,
q=chisq_index,
n=chisq_nhigh,
null_thresh=null_thresh,
snr_threshold=snr_thresh,
sngl_snr_threshold=sngl_snr_thresh,
chisq_threshold=new_snr_thresh,
null_grad_thresh=null_grad_thresh,
null_grad_val=null_grad_val)
trig_bestnr[slide_id] = trigs['network/reweighted_snr'][slide_trigs]
logging.info("Time, SNR, and BestNR of triggers extracted.")

return trig_time, trig_snr, trig_bestnr
@@ -699,7 +683,7 @@ def construct_trials(seg_files, seg_dict, ifos, slide_dict, vetoes):
seg_buffer.coalesce()

        # Construct the ifo-indexed dictionary of slid vetoes
slid_vetoes = _slide_vetoes(vetoes, slide_dict, slide_id)
slid_vetoes = _slide_vetoes(vetoes, slide_dict, slide_id, ifos)

# Construct trial list and check against buffer
trial_dict[slide_id] = segments.segmentlist()
@@ -804,3 +788,24 @@ def get_coinc_snr(trigs_or_injs, ifos):
coinc_snr = numpy.sqrt(snr_sum_square)

return coinc_snr


def template_hash_to_id(trigger_file, bank_path):
"""
This function converts the template hashes from a trigger file
into 'template_id's that represent indices of the
templates within the bank.
Parameters
----------
trigger_file: h5py File object for trigger file
bank_file: filepath for template bank
"""
with h5py.File(bank_path, "r") as bank:
hashes = bank['template_hash'][:]
ifos = [k for k in trigger_file.keys() if k != 'network']
trig_hashes = trigger_file[f'{ifos[0]}/template_hash'][:]
trig_ids = numpy.zeros(trig_hashes.shape[0], dtype=int)
for idx, t_hash in enumerate(hashes):
matches = numpy.where(trig_hashes == t_hash)
trig_ids[matches] = idx
return trig_ids
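A brief usage sketch of the new helper; the paths are placeholders, and the trigger file is assumed to contain per-detector template_hash datasets as in the files produced above:

import h5py
from pycbc.results.pygrb_postprocessing_utils import template_hash_to_id

# Map per-detector template hashes in a merged trigger file onto row
# indices of the full bank and store them alongside the triggers.
with h5py.File("GRB_MERGED_TRIGGERS.h5", "a") as trig_file:
    ids = template_hash_to_id(trig_file, "H1L1V1-BANK.h5")
    trig_file.create_dataset("network/template_id", data=ids)

Note that the lookup loops over every bank hash; for very large banks a sorted-hash search (e.g. numpy.searchsorted) would give the same mapping more quickly, but the simple loop keeps the code easy to read.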
14 changes: 8 additions & 6 deletions pycbc/workflow/grb_utils.py
@@ -233,8 +233,8 @@ def get_sky_grid_scale(
return out


def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
inj_files, inj_insp_files, inj_tags):
def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, bank_file,
insp_files, inj_files, inj_insp_files, inj_tags):
"""
Generate post-processing section of PyGRB offline workflow
"""
@@ -258,7 +258,7 @@ def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
job_instance = exe_class(wf.cp, "trig_combiner")
# Create node for coherent no injections jobs
node, trig_files = job_instance.create_node(wf.ifos, seg_dir, segment,
insp_files, pp_dir)
insp_files, pp_dir, bank_file)
wf.add_node(node)
pp_outs.append(trig_files)

@@ -285,7 +285,7 @@ def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
if inj_tag in f.tags[1]])
node, inj_find_file = job_instance.create_node(
tag_inj_files, tag_insp_files,
pp_dir)
bank_file, pp_dir)
wf.add_node(node)
inj_find_files.append(inj_find_file)
pp_outs.append(inj_find_files)
@@ -321,7 +321,7 @@ def __init__(self, cp, name):
self.num_trials = int(cp.get('trig_combiner', 'num-trials'))

def create_node(self, ifo_tag, seg_dir, segment, insp_files,
out_dir, tags=None):
out_dir, bank_file, tags=None):
node = Node(self)
node.add_opt('--verbose')
node.add_opt("--ifo-tag", ifo_tag)
@@ -331,6 +331,7 @@ def create_node(self, ifo_tag, seg_dir, segment, insp_files,
node.add_input_list_opt("--input-files", insp_files)
node.add_opt("--user-tag", "PYGRB")
node.add_opt("--num-trials", self.num_trials)
node.add_input_opt("--bank-file", bank_file)
# Prepare output file tag
user_tag = f"PYGRB_GRB{self.trigger_name}"
if tags:
@@ -385,13 +386,14 @@ class PycbcGrbInjFinderExecutable(Executable):
def __init__(self, cp, exe_name):
super().__init__(cp=cp, name=exe_name)

def create_node(self, inj_files, inj_insp_files,
def create_node(self, inj_files, inj_insp_files, bank_file,
out_dir, tags=None):
if tags is None:
tags = []
node = Node(self)
node.add_input_list_opt('--input-files', inj_insp_files)
node.add_input_list_opt('--inj-files', inj_files)
node.add_input_opt('--bank-file', bank_file)
ifo_tag, desc, segment = filename_metadata(inj_files[0].name)
desc = '_'.join(desc.split('_')[:-1])
out_name = "{}-{}_FOUNDMISSED-{}-{}.h5".format(
