PyGRB: Update postprocessing functions #4550

Merged · 40 commits · Feb 15, 2024
Commits
596920a
Add mapping function
jakeb245 Jan 18, 2024
bf8e971
Implement mapping in trig combiner
jakeb245 Jan 18, 2024
02b04e0
Fix "template_id" dataset being written too early
jakeb245 Jan 22, 2024
c736dee
Force template_id to be integer
jakeb245 Jan 22, 2024
7c9bbd2
Try adding bank files to workflow
jakeb245 Jan 22, 2024
e707e28
Remove fixme
jakeb245 Jan 22, 2024
b75940e
Only use one bank_file
jakeb245 Jan 22, 2024
a4ac47c
Add template mapping to inj finder
jakeb245 Jan 24, 2024
8b57915
Small change
jakeb245 Jan 24, 2024
a2011a5
mapping function works with trig file object
jakeb245 Jan 24, 2024
50916d8
Small change
jakeb245 Jan 24, 2024
9369491
Remove typo
jakeb245 Jan 24, 2024
93edd5f
Add bank file opt to inj_finder
jakeb245 Jan 26, 2024
1f0f7fa
Add template masses to multi_inspiral output
jakeb245 Oct 31, 2023
93ac747
sort_trigs updates
jakeb245 Oct 31, 2023
67394c0
Extract trig properties
jakeb245 Oct 31, 2023
3c40adf
Add old imports back for CodeClimate
jakeb245 Nov 3, 2023
e24c0e6
Remove unused bestnr opts
jakeb245 Nov 6, 2023
9f1ba6c
Update pycbc/results/pygrb_postprocessing_utils.py
jakeb245 Nov 20, 2023
fc9b061
Codeclimate
jakeb245 Nov 29, 2023
6f48e9f
Remove masses from multi inspiral output
jakeb245 Jan 8, 2024
70935fb
Correct segment loading names
jakeb245 Jan 16, 2024
fec0810
Add NoneType handling to _slide_vetoes function
jakeb245 Jan 16, 2024
77a68ca
Indentation fix
jakeb245 Jan 16, 2024
671b914
Add 's' back in
jakeb245 Jan 23, 2024
03e8f82
Fix docstring(?)
jakeb245 Jan 26, 2024
449d704
Codeclimate
jakeb245 Jan 26, 2024
da15eaa
Codeclimate
jakeb245 Jan 30, 2024
778ffd5
Merge remote-tracking branch 'gwastro/master' into pp_utils
jakeb245 Jan 30, 2024
d4f297f
Merge remote-tracking branch 'gwastro/master' into pp_utils
jakeb245 Feb 9, 2024
88a492a
Update pycbc/results/pygrb_postprocessing_utils.py
jakeb245 Feb 12, 2024
983ac9b
Uses event_ids in sort_trigs to avoid confusion
jakeb245 Feb 12, 2024
32cdadf
Add possibility of multiple banks (and NotImplementedError)
jakeb245 Feb 12, 2024
ad78bea
Remove enumerate and fix indexing issue
jakeb245 Feb 13, 2024
d870dbb
Check for single bank earlier
jakeb245 Feb 13, 2024
8ed469a
Simplify column name check
jakeb245 Feb 13, 2024
d17b8fc
Use zip()
jakeb245 Feb 13, 2024
d36f809
Merge remote-tracking branch 'gwastro/master' into pp_utils
jakeb245 Feb 13, 2024
fea14c6
Update pycbc/results/pygrb_postprocessing_utils.py
jakeb245 Feb 15, 2024
6d5c597
Update pycbc/results/pygrb_postprocessing_utils.py
jakeb245 Feb 15, 2024
21 changes: 19 additions & 2 deletions bin/pygrb/pycbc_grb_inj_finder
@@ -42,6 +42,7 @@ from ligo.segments.utils import fromsegwizard
from pycbc import __version__
from pycbc.inject import InjectionSet
from pycbc.io import FieldArray
from pycbc.results.pygrb_postprocessing_utils import template_hash_to_id

__author__ = "Duncan Macleod <duncan.macleod@ligo.org>"

@@ -96,6 +97,8 @@ def read_hdf5_triggers(inputfiles, verbose=False):
elif name.startswith("network") and \
NETWORK_IFO_EVENT_ID_REGEX.match(name):
return
elif "template_id" in name:
return
else:
shape = obj.shape
dtype = obj.dtype
@@ -126,7 +129,7 @@ def read_hdf5_triggers(inputfiles, verbose=False):
# copy dataset contents
for filename in inputfiles:
with h5py.File(filename, 'r') as h5in:
# Skipping emtpy files
# Skipping empty files
if len(h5in['network']['end_time_gc'][:]):
for dset in datasets:
data = h5in[dset][:]
@@ -174,6 +177,11 @@ parser.add_argument(
nargs="+",
help="path(s) to input injection file(s)",
)
parser.add_argument(
"--bank-file",
required=True,
help="full template bank file for mapping hashes to ids",
)
parser.add_argument(
"-o",
"--output-dir",
@@ -329,12 +337,21 @@ with h5py.File(outfilename, "w") as h5out:
for x in out_trigs:
xgroup = h5out.create_group(x)
for col in out_trigs[x]:
if "template_id" == col:
continue
xgroup.create_dataset(
col,
data=out_trigs[x][col],
compression="gzip",
compression_opts=9,
)

# map and write template ids
ids = template_hash_to_id(h5out, args.bank_file)
h5out.create_dataset(
"network/template_id",
data=ids,
compression="gzip",
compression_opts=9,
)
if args.verbose:
print("Found/missed written to {}".format(outfilename))
19 changes: 19 additions & 0 deletions bin/pygrb/pycbc_grb_trig_combiner
@@ -36,6 +36,7 @@ from ligo import segments
from ligo.segments.utils import fromsegwizard

from pycbc import __version__
from pycbc.results.pygrb_postprocessing_utils import template_hash_to_id

__author__ = "Duncan Macleod <duncan.macleod@ligo.org>"

@@ -55,6 +56,8 @@ def _init_combined_file(h5file, attributes, datasets, **compression_kw):
h5file.attrs.update(attributes)
# create datasets
for dset, (shape, dtype) in datasets.items():
if "template_id" in dset:
continue
h5file.create_dataset(dset, shape=shape, dtype=dtype,
**compression_kw)

@@ -207,6 +210,11 @@ def merge_hdf5_files(inputfiles, outputfile, verbose=False, **compression_kw):
else:
ifo_event_id_pos = None

# Template ID becomes unhelpful in the merged file.
# This will be handled separately.
if "template_id" in dset:
continue

# merge dataset
position[dset] += _merge_dataset(
h5in[dset], h5out[dset], position[dset],
@@ -218,6 +226,12 @@ def merge_hdf5_files(inputfiles, outputfile, verbose=False, **compression_kw):
while search_datasets:
dataset_names.remove(search_datasets.pop())

template_ids = template_hash_to_id(trigger_file=h5out, bank_path=args.bank_file)
h5out.create_dataset("network/template_id",
data=template_ids,
compression="gzip",
compression_opts=9)

if verbose:
print("Merged triggers written to {}".format(outputfile))
return outputfile
@@ -423,6 +437,11 @@ parser.add_argument(
metavar="TRIGGER FILE",
help="read in listed trigger files",
)
parser.add_argument(
"--bank-file",
required=True,
help="full template bank file for mapping hashes to ids",
)
parser.add_argument(
"-o",
"--output-dir",
9 changes: 7 additions & 2 deletions bin/pygrb/pycbc_make_offline_grb_workflow
@@ -273,9 +273,14 @@ all_files.extend(datafind_veto_files)
# TEMPLATE BANK AND SPLIT BANK
bank_files = _workflow.setup_tmpltbank_workflow(wflow, sciSegs,
datafind_files, df_dir)
# Check there are not multiple banks
if len(bank_files) > 1:
raise NotImplementedError("Multiple banks not supported")
full_bank_file = bank_files[0]
# Note: setup_splittable_workflow requires a FileList as input
splitbank_files = _workflow.setup_splittable_workflow(wflow, bank_files, df_dir,
tags=["inspiral"])
all_files.extend(bank_files)
all_files.append(full_bank_file)
all_files.extend(splitbank_files)

# INJECTIONS
@@ -411,7 +416,7 @@ if post_proc_method == "PYGRB_OFFLINE":
# The content and structure of pp_files are described in the definition
# of _workflow.setup_pygrb_pp_workflow
pp_files = _workflow.setup_pygrb_pp_workflow(wflow, pp_dir, seg_dir,
sciSegs[ifo][0], inspiral_files,
sciSegs[ifo][0], full_bank_file, inspiral_files,
injs, inj_insp_files, inj_tags)
sec_name = 'workflow-pygrb_pp_workflow'
if not wflow.cp.has_section(sec_name):
95 changes: 50 additions & 45 deletions pycbc/results/pygrb_postprocessing_utils.py
Expand Up @@ -310,16 +310,17 @@ def _get_id_numbers(ligolw_table, column):
# =============================================================================
# Function to build a dictionary (indexed by ifo) of time-slid vetoes
# =============================================================================
def _slide_vetoes(vetoes, slide_dict_or_list, slide_id):
def _slide_vetoes(vetoes, slide_dict_or_list, slide_id, ifos):
"""Build a dictionary (indexed by ifo) of time-slid vetoes"""

# Copy vetoes
slid_vetoes = copy.deepcopy(vetoes)

# Slide them
ifos = vetoes.keys()
for ifo in ifos:
slid_vetoes[ifo].shift(-slide_dict_or_list[slide_id][ifo])
if vetoes is not None:
slid_vetoes = copy.deepcopy(vetoes)
# Slide them
for ifo in ifos:
slid_vetoes[ifo].shift(-slide_dict_or_list[slide_id][ifo])
else:
slid_vetoes = {ifo: segments.segmentlist() for ifo in ifos}

return slid_vetoes
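A minimal sketch of the new NoneType branch (illustrative call; _slide_vetoes is module-private and the argument values here are assumed): with vetoes=None, the function now returns an empty segmentlist per ifo instead of raising on .keys().

from ligo import segments

# Sketch: no veto file supplied, so each ifo gets an empty segmentlist;
# slide_dict_or_list is never indexed on this branch.
slid_vetoes = _slide_vetoes(None, {}, 0, ifos=['H1', 'L1'])
assert slid_vetoes == {'H1': segments.segmentlist(),
                       'L1': segments.segmentlist()}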

@@ -461,84 +462,67 @@ def sort_trigs(trial_dict, trigs, slide_dict, seg_dict):
sorted_trigs = {}

# Begin by sorting the triggers into each slide
# New seems pretty slow, so run it once and then use deepcopy
tmp_table = glsctables.New(glsctables.MultiInspiralTable)
for slide_id in slide_dict:
sorted_trigs[slide_id] = copy.deepcopy(tmp_table)
for trig in trigs:
sorted_trigs[int(trig.time_slide_id)].append(trig)
sorted_trigs[slide_id] = []
for slide_id, event_id in zip(trigs['network/slide_id'],
trigs['network/event_id']):
sorted_trigs[slide_id].append(event_id)

for slide_id in slide_dict:
# These can only *reduce* the analysis time
curr_seg_list = seg_dict[slide_id]

# Check the triggers are all in the analysed segment lists
for trig in sorted_trigs[slide_id]:
if trig.end_time not in curr_seg_list:
for event_id in sorted_trigs[slide_id]:
index = numpy.flatnonzero(trigs['network/event_id'] == event_id)[0]
end_time = trigs['network/end_time_gc'][index]
if end_time not in curr_seg_list:
# This can be raised if the trigger is on the segment boundary,
# so check if the trigger is within 1/100 of a second within
# the list
if trig.get_end() + 0.01 in curr_seg_list:
if end_time + 0.01 in curr_seg_list:
continue
if trig.get_end() - 0.01 in curr_seg_list:
if end_time - 0.01 in curr_seg_list:
continue
err_msg = "Triggers found in input files not in the list of "
err_msg += "analysed segments. This should not happen."
raise RuntimeError(err_msg)
# END OF CHECK #

# The below line works like the inverse of .veto and only returns trigs
# that are within the segment specified by trial_dict[slide_id]
sorted_trigs[slide_id] = \
sorted_trigs[slide_id].vetoed(trial_dict[slide_id])
# Keep triggers that are in trial_dict
sorted_trigs[slide_id] = [event_id for event_id in
sorted_trigs[slide_id]
if trigs['network/end_time_gc'][
trigs['network/event_id'] == event_id][0]
in trial_dict[slide_id]]

return sorted_trigs
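A side note, not part of this PR: the per-trigger numpy.flatnonzero lookups above can be batched. A sketch on plain numpy arrays, assuming event IDs are unique:

import numpy

def end_times_for_slide(event_ids, end_times, slide_event_ids):
    """Sketch: vectorized end-time lookup for one slide's event IDs."""
    # numpy.isin replaces one flatnonzero call per trigger
    mask = numpy.isin(event_ids, slide_event_ids)
    return end_times[mask]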


# =============================================================================
# Extract basic trigger properties and store them as dictionaries
# =============================================================================
def extract_basic_trig_properties(trial_dict, trigs, slide_dict, seg_dict,
opts):
def extract_basic_trig_properties(trial_dict, trigs, slide_dict, seg_dict):
"""Extract and store as dictionaries time, SNR, and BestNR of
time-slid triggers"""

# Sort the triggers into each slide
sorted_trigs = sort_trigs(trial_dict, trigs, slide_dict, seg_dict)
logging.info("Triggers sorted.")

# Local copies of variables entering the BestNR definition
chisq_index = opts.chisq_index
chisq_nhigh = opts.chisq_nhigh
null_thresh = list(map(float, opts.null_snr_threshold.split(',')))
snr_thresh = opts.snr_threshold
sngl_snr_thresh = opts.sngl_snr_threshold
new_snr_thresh = opts.newsnr_threshold
null_grad_thresh = opts.null_grad_thresh
null_grad_val = opts.null_grad_val

# Build the 3 dictionaries
trig_time = {}
trig_snr = {}
trig_bestnr = {}
for slide_id in slide_dict:
slide_trigs = sorted_trigs[slide_id]
if slide_trigs:
trig_time[slide_id] = numpy.asarray(slide_trigs.get_end()).\
astype(float)
trig_snr[slide_id] = numpy.asarray(slide_trigs.get_column('snr'))
trig_time[slide_id] = trigs['network/end_time_gc'][slide_trigs]
trig_snr[slide_id] = trigs['network/coherent_snr'][slide_trigs]
else:
trig_time[slide_id] = numpy.asarray([])
trig_snr[slide_id] = numpy.asarray([])
trig_bestnr[slide_id] = get_bestnrs(slide_trigs,
q=chisq_index,
n=chisq_nhigh,
null_thresh=null_thresh,
snr_threshold=snr_thresh,
sngl_snr_threshold=sngl_snr_thresh,
chisq_threshold=new_snr_thresh,
null_grad_thresh=null_grad_thresh,
null_grad_val=null_grad_val)
trig_bestnr[slide_id] = trigs['network/reweighted_snr'][slide_trigs]
logging.info("Time, SNR, and BestNR of triggers extracted.")

return trig_time, trig_snr, trig_bestnr
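With the opts argument gone and BestNR taken directly from 'network/reweighted_snr', the call site shrinks accordingly; a sketch, with the argument names taken from the surrounding code:

# Sketch: thresholds no longer enter here, since the reweighted SNR
# was already computed and stored by the inspiral jobs.
trig_time, trig_snr, trig_bestnr = extract_basic_trig_properties(
    trial_dict, trigs, slide_dict, seg_dict)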
@@ -699,7 +683,7 @@ def construct_trials(seg_files, seg_dict, ifos, slide_dict, vetoes):
seg_buffer.coalesce()

# Construct the ifo-indexed dictionary of slid vetoes
slid_vetoes = _slide_vetoes(vetoes, slide_dict, slide_id)
slid_vetoes = _slide_vetoes(vetoes, slide_dict, slide_id, ifos)

# Construct trial list and check against buffer
trial_dict[slide_id] = segments.segmentlist()
@@ -804,3 +788,24 @@ def get_coinc_snr(trigs_or_injs, ifos):
coinc_snr = numpy.sqrt(snr_sum_square)

return coinc_snr


def template_hash_to_id(trigger_file, bank_path):
"""
Convert the template hashes from a trigger file into 'template_id's,
i.e. the indices of the templates within the bank.

Parameters
----------
trigger_file: h5py.File
    The trigger file object.
bank_path: str
    Path to the full template bank file.
"""
with h5py.File(bank_path, "r") as bank:
hashes = bank['template_hash'][:]
ifos = [k for k in trigger_file.keys() if k != 'network']
trig_hashes = trigger_file[f'{ifos[0]}/template_hash'][:]
trig_ids = numpy.zeros(trig_hashes.shape[0], dtype=int)
for idx, t_hash in enumerate(hashes):
matches = numpy.where(trig_hashes == t_hash)
trig_ids[matches] = idx
return trig_ids
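The loop above scales as O(N_bank × N_trig). An equivalent mapping via numpy.searchsorted is sketched below, purely as an illustration; it assumes every trigger hash occurs in the bank exactly once:

import h5py
import numpy

def template_hash_to_id_sorted(trigger_file, bank_path):
    """Sketch: the same hash -> index mapping via a sorted lookup."""
    with h5py.File(bank_path, "r") as bank:
        hashes = bank['template_hash'][:]
    ifos = [k for k in trigger_file.keys() if k != 'network']
    trig_hashes = trigger_file[f'{ifos[0]}/template_hash'][:]
    order = numpy.argsort(hashes)
    # Position of each trigger hash within the sorted bank hashes
    pos = numpy.searchsorted(hashes[order], trig_hashes)
    return order[pos].astype(int)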
14 changes: 8 additions & 6 deletions pycbc/workflow/grb_utils.py
@@ -233,8 +233,8 @@ def get_sky_grid_scale(
return out


def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
inj_files, inj_insp_files, inj_tags):
def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, bank_file,
insp_files, inj_files, inj_insp_files, inj_tags):
"""
Generate post-processing section of PyGRB offline workflow
"""
@@ -258,7 +258,7 @@ def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
job_instance = exe_class(wf.cp, "trig_combiner")
# Create node for coherent no injections jobs
node, trig_files = job_instance.create_node(wf.ifos, seg_dir, segment,
insp_files, pp_dir)
insp_files, pp_dir, bank_file)
wf.add_node(node)
pp_outs.append(trig_files)

@@ -285,7 +285,7 @@ def setup_pygrb_pp_workflow(wf, pp_dir, seg_dir, segment, insp_files,
if inj_tag in f.tags[1]])
node, inj_find_file = job_instance.create_node(
tag_inj_files, tag_insp_files,
pp_dir)
bank_file, pp_dir)
wf.add_node(node)
inj_find_files.append(inj_find_file)
pp_outs.append(inj_find_files)
@@ -321,7 +321,7 @@ def __init__(self, cp, name):
self.num_trials = int(cp.get('trig_combiner', 'num-trials'))

def create_node(self, ifo_tag, seg_dir, segment, insp_files,
out_dir, tags=None):
out_dir, bank_file, tags=None):
node = Node(self)
node.add_opt('--verbose')
node.add_opt("--ifo-tag", ifo_tag)
@@ -331,6 +331,7 @@ def create_node(self, ifo_tag, seg_dir, segment, insp_files,
node.add_input_list_opt("--input-files", insp_files)
node.add_opt("--user-tag", "PYGRB")
node.add_opt("--num-trials", self.num_trials)
node.add_input_opt("--bank-file", bank_file)
# Prepare output file tag
user_tag = f"PYGRB_GRB{self.trigger_name}"
if tags:
@@ -385,13 +386,14 @@ class PycbcGrbInjFinderExecutable(Executable):
def __init__(self, cp, exe_name):
super().__init__(cp=cp, name=exe_name)

def create_node(self, inj_files, inj_insp_files,
def create_node(self, inj_files, inj_insp_files, bank_file,
out_dir, tags=None):
if tags is None:
tags = []
node = Node(self)
node.add_input_list_opt('--input-files', inj_insp_files)
node.add_input_list_opt('--inj-files', inj_files)
node.add_input_opt('--bank-file', bank_file)
ifo_tag, desc, segment = filename_metadata(inj_files[0].name)
desc = '_'.join(desc.split('_')[:-1])
out_name = "{}-{}_FOUNDMISSED-{}-{}.h5".format(