Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PyGRB trigger clustering with short timeslides #4820

Merged
merged 2 commits into from
Jul 21, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 118 additions & 80 deletions bin/pygrb/pycbc_grb_trig_cluster
Original file line number Diff line number Diff line change
Expand Up @@ -168,105 +168,143 @@ outfile = os.path.join(
),
)

# -- generate clustering bins -------------------
# this list contains the indexing of clusters from all slides
all_clusters = []

nbins = int((end - start) // win + 1)
bins = [[] for i in range(nbins)]
loudsnr = numpy.zeros(nbins)
loudtime = numpy.zeros(nbins)
clusters = []
# load necessary information from all triggers

# -- cluster ------------------------------------
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need this comment ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, because this where we cluster. I will add a comment to flag where the output is written.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I commented on lines you removed. Now I see the comment in line 189 of your new code


with HFile(args.trig_file, "r") as h5f:
time = h5f["network"]["end_time_gc"][()]
snr = h5f["network"][args.rank_column][()]
with h5py.File(args.trig_file, "r") as h5f:
all_times = h5f["network"]["end_time_gc"][()]
Copy link
Contributor

@sebastiangomezlopez sebastiangomezlopez Jul 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is more a matter of taste. Would you consider rewriting it as h5f["network/end_time_gc"][()]? Same for the other lines used to load hdf data.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did the same with the slide_id

all_snrs = h5f["network"][args.rank_column][()]
Copy link
Contributor

@sebastiangomezlopez sebastiangomezlopez Jul 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case you agree, here an f-string would do the job, h5f[f"network/{args.rank_column"}][()]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

h5f[f"network/{args.rank_column}"][()]

slide_ids = h5f["network"]["slide_id"][()]

# empty file (no triggers), so just copy the file
if not time.size:
if not all_times.size:
shutil.copyfile(args.trig_file, outfile)
msg = "trigger file is empty\n"
msg += "copied input file to {}".format(outfile)
logging.info(msg)
sys.exit(0)

# find loudest trigger in each bin
for i in tqdm.tqdm(range(time.size), desc="Initialising bins",
disable=not args.verbose, total=time.size, unit='triggers',
**TQDM_KW):
t, s = time[i], snr[i]
idx = int(float(t - start) // win)
bins[idx].append(i)
if s > loudsnr[idx]:
loudsnr[idx] = s
loudtime[idx] = t

prev = -1
nxt_ = 1
first = True
last = False
add_cluster = clusters.append
nclusters = 0

# cluster
bar = tqdm.tqdm(bins, desc="Clustering bins",
disable=not args.verbose, total=nbins, unit='bins',
postfix=dict(nclusters=0), **TQDM_KW)
for i, bin_ in enumerate(bar):
if not bin_: # empty
continue

for idx in bin_:
t, s = time[idx], snr[idx]

if s < loudsnr[i]: # not loudest in own bin
# -- cluster ------------------------------------

unique_slide_ids = numpy.unique(slide_ids)
max_slide_id = max(unique_slide_ids)
msg = 'Clustering '+str(len(slide_ids))+' triggers from '
msg += str(len(unique_slide_ids))+' slides'
logging.info(msg)

for slide_id in unique_slide_ids:
# indices to slice current slide
slide_id_pos = numpy.where(slide_ids == slide_id)[0]
# all time and snr values for the current slide
time = all_times[slide_id_pos]
snr = all_snrs[slide_id_pos]

# generate clustering bins
nbins = int((end - start) // win + 1)
bins = [[] for i in range(nbins)]
loudsnr = numpy.zeros(nbins)
loudtime = numpy.zeros(nbins)
# list to index clusters for current slide
clusters = []

# find loudest trigger in each bin, for the current slide
for i in tqdm.tqdm(range(time.size),
desc="Initialising bins",
disable=not args.verbose,
total=time.size,
unit='triggers',
**TQDM_KW):
t, s = time[i], snr[i]
idx = int(float(t - start) // win)
bins[idx].append(i)
if s > loudsnr[idx]:
loudsnr[idx] = s
loudtime[idx] = t

prev = -1
nxt_ = 1
first = True
last = False
add_cluster = clusters.append
nclusters = 0

# cluster
bar = tqdm.tqdm(bins,
desc="Clustering bins",
disable=not args.verbose,
total=nbins,
unit='bins',
postfix=dict(nclusters=0),
**TQDM_KW)
for i, bin_ in enumerate(bar):
if not bin_: # empty
continue

Comment on lines +189 to 244
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole block of code looks good to me

# check loudest event in previous bin
if not first:
prevt = loudtime[prev]
if prevt and abs(prevt - t) < win and s < loudsnr[prev]:
continue
for idx in bin_:
t, s = time[idx], snr[idx]

# check loudest event in next bin
if not last:
nextt = loudtime[nxt_]
if nextt and abs(nextt - t) < win and s < loudsnr[nxt_]:
if s < loudsnr[i]: # not loudest in own bin
continue

loudest = True

# check all events in previous bin
if not first and prevt and abs(prevt - t) < win:
for id2 in bins[prev]:
if abs(time[id2] - t) < win and s < snr[id2]:
loudest = False
break

# check all events in next bin
if loudest and not last and nextt and abs(nextt - t) < win:
for id2 in bins[nxt_]:
if abs(time[id2] - t) < win and s < snr[id2]:
loudest = False
break

# this is loudest in its vicinity, keep it
if loudest:
add_cluster(idx)
nclusters += 1
bar.set_postfix(nclusters=nclusters)

# update things for next time
first = False
last = i == nbins - 1
prev += 1
nxt_ += 1
# check loudest event in previous bin
if not first:
prevt = loudtime[prev]
if prevt and abs(prevt - t) < win and s < loudsnr[prev]:
continue

# check loudest event in next bin
if not last:
nextt = loudtime[nxt_]
if nextt and abs(nextt - t) < win and s < loudsnr[nxt_]:
continue

loudest = True

# check all events in previous bin
if not first and prevt and abs(prevt - t) < win:
for id2 in bins[prev]:
if abs(time[id2] - t) < win and s < snr[id2]:
loudest = False
break

# check all events in next bin
if loudest and not last and nextt and abs(nextt - t) < win:
for id2 in bins[nxt_]:
if abs(time[id2] - t) < win and s < snr[id2]:
loudest = False
break

# this is loudest in its vicinity, keep it
if loudest:
add_cluster(idx)
nclusters += 1
bar.set_postfix(nclusters=nclusters)

# update things for next time
first = False
last = i == nbins - 1
prev += 1
nxt_ += 1

bar.update()

# clusters is the indexing array for a specific slide_id
# all_clusters is the (absolute) indexing of all clustered triggers
# so look up the indices [clusters] within the absolute indexing array
# slide_id_pos which is built at each slide_id
all_clusters += list(slide_id_pos[clusters])
msg = 'Slide '+str(slide_id)+'/'+str(max_slide_id)
msg += ' has '+str(len(slide_id_pos))
msg += ' trigers that were clustered to '+str(len(clusters))
logging.info(msg)

bar.update()
logging.info('Total clustered triggers: '+str(len(all_clusters)))

slice_hdf5(
args.trig_file,
outfile,
numpy.asarray(clusters),
numpy.asarray(all_clusters),
verbose=args.verbose,
)
Loading