Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PyGRB trigger clustering with short timeslides #4820

Merged
merged 2 commits into from
Jul 21, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 118 additions & 78 deletions bin/pygrb/pycbc_grb_trig_cluster
Original file line number Diff line number Diff line change
Expand Up @@ -168,105 +168,145 @@ outfile = os.path.join(
),
)

# -- generate clustering bins -------------------
# this list contains the indexing of clusters from all slides
all_clusters = []

nbins = int((end - start) // win + 1)
bins = [[] for i in range(nbins)]
loudsnr = numpy.zeros(nbins)
loudtime = numpy.zeros(nbins)
clusters = []

# -- cluster ------------------------------------
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need this comment ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, because this is where we cluster. I will add a comment to flag where the output is written.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I commented on lines you removed. Now I see the comment in line 189 of your new code

# load necessary information from all triggers

with HFile(args.trig_file, "r") as h5f:
time = h5f["network"]["end_time_gc"][()]
snr = h5f["network"][args.rank_column][()]
all_times = h5f["network/end_time_gc"][()]
all_snrs = h5f[f"network/{args.rank_column}"][()]
slide_ids = h5f["network/slide_id"][()]

# empty file (no triggers), so just copy the file
if not time.size:
if not all_times.size:
shutil.copyfile(args.trig_file, outfile)
msg = "trigger file is empty\n"
msg += "copied input file to {}".format(outfile)
logging.info(msg)
sys.exit(0)

# find loudest trigger in each bin
for i in tqdm.tqdm(range(time.size), desc="Initialising bins",
disable=not args.verbose, total=time.size, unit='triggers',
**TQDM_KW):
t, s = time[i], snr[i]
idx = int(float(t - start) // win)
bins[idx].append(i)
if s > loudsnr[idx]:
loudsnr[idx] = s
loudtime[idx] = t

prev = -1
nxt_ = 1
first = True
last = False
add_cluster = clusters.append
nclusters = 0

# cluster
bar = tqdm.tqdm(bins, desc="Clustering bins",
disable=not args.verbose, total=nbins, unit='bins',
postfix=dict(nclusters=0), **TQDM_KW)
for i, bin_ in enumerate(bar):
if not bin_: # empty
continue

for idx in bin_:
t, s = time[idx], snr[idx]

if s < loudsnr[i]: # not loudest in own bin
# -- cluster ------------------------------------

unique_slide_ids = numpy.unique(slide_ids)
max_slide_id = max(unique_slide_ids)
msg = 'Clustering '+str(len(slide_ids))+' triggers from '
msg += str(len(unique_slide_ids))+' slides'
logging.info(msg)

for slide_id in unique_slide_ids:
# indices to slice current slide
slide_id_pos = numpy.where(slide_ids == slide_id)[0]
# all time and snr values for the current slide
time = all_times[slide_id_pos]
snr = all_snrs[slide_id_pos]

# generate clustering bins
nbins = int((end - start) // win + 1)
bins = [[] for i in range(nbins)]
loudsnr = numpy.zeros(nbins)
loudtime = numpy.zeros(nbins)
# list to index clusters for current slide
clusters = []

# find loudest trigger in each bin, for the current slide
for i in tqdm.tqdm(range(time.size),
desc="Initialising bins",
disable=not args.verbose,
total=time.size,
unit='triggers',
**TQDM_KW):
t, s = time[i], snr[i]
idx = int(float(t - start) // win)
bins[idx].append(i)
if s > loudsnr[idx]:
loudsnr[idx] = s
loudtime[idx] = t

prev = -1
nxt_ = 1
first = True
last = False
add_cluster = clusters.append
nclusters = 0

# cluster
bar = tqdm.tqdm(bins,
desc="Clustering bins",
disable=not args.verbose,
total=nbins,
unit='bins',
postfix=dict(nclusters=0),
**TQDM_KW)
for i, bin_ in enumerate(bar):
if not bin_: # empty
continue

Comment on lines +189 to 244
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole block of code looks good to me

# check loudest event in previous bin
if not first:
prevt = loudtime[prev]
if prevt and abs(prevt - t) < win and s < loudsnr[prev]:
continue
for idx in bin_:
t, s = time[idx], snr[idx]

# check loudest event in next bin
if not last:
nextt = loudtime[nxt_]
if nextt and abs(nextt - t) < win and s < loudsnr[nxt_]:
if s < loudsnr[i]: # not loudest in own bin
continue

loudest = True

# check all events in previous bin
if not first and prevt and abs(prevt - t) < win:
for id2 in bins[prev]:
if abs(time[id2] - t) < win and s < snr[id2]:
loudest = False
break

# check all events in next bin
if loudest and not last and nextt and abs(nextt - t) < win:
for id2 in bins[nxt_]:
if abs(time[id2] - t) < win and s < snr[id2]:
loudest = False
break

# this is loudest in its vicinity, keep it
if loudest:
add_cluster(idx)
nclusters += 1
bar.set_postfix(nclusters=nclusters)
# check loudest event in previous bin
if not first:
prevt = loudtime[prev]
if prevt and abs(prevt - t) < win and s < loudsnr[prev]:
continue

# check loudest event in next bin
if not last:
nextt = loudtime[nxt_]
if nextt and abs(nextt - t) < win and s < loudsnr[nxt_]:
continue

loudest = True

# check all events in previous bin
if not first and prevt and abs(prevt - t) < win:
for id2 in bins[prev]:
if abs(time[id2] - t) < win and s < snr[id2]:
loudest = False
break

# check all events in next bin
if loudest and not last and nextt and abs(nextt - t) < win:
for id2 in bins[nxt_]:
if abs(time[id2] - t) < win and s < snr[id2]:
loudest = False
break

# this is loudest in its vicinity, keep it
if loudest:
add_cluster(idx)
nclusters += 1
bar.set_postfix(nclusters=nclusters)

# update things for next time
first = False
last = i == nbins - 1
prev += 1
nxt_ += 1

bar.update()

Comment on lines +245 to +292
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mostly rearranging lines, looks good.

# clusters is the indexing array for a specific slide_id
# all_clusters is the (absolute) indexing of all clustered triggers
# so look up the indices [clusters] within the absolute indexing array
# slide_id_pos which is built at each slide_id
all_clusters += list(slide_id_pos[clusters])
msg = 'Slide '+str(slide_id)+'/'+str(max_slide_id)
msg += ' has '+str(len(slide_id_pos))
msg += ' triggers that were clustered to '+str(len(clusters))
logging.info(msg)

# update things for next time
first = False
last = i == nbins - 1
prev += 1
nxt_ += 1
logging.info('Total clustered triggers: '+str(len(all_clusters)))

bar.update()
# -- write output --------------------------------

slice_hdf5(
args.trig_file,
outfile,
numpy.asarray(clusters),
numpy.asarray(all_clusters),
verbose=args.verbose,
)
Comment on lines +293 to 312
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting approach overall. I'm ok with merging once tests are passed.

Loading