
Commit 86d99c2

Authored by Michael McCrackan (mmccrackan)
Add try except to catch det_match failures (#1077)
* error check for get_groups
* further fixes
* fix for multilayer_preprocess_tod
* fix get_groups
* fix group order
* revert to dict
* performance improvements
* extra fixes
* updates for handling returns
* add check if temp dir exists
* fix logger
* updates for multilayer preproc
* rearrange group removal
* address comments
* fix space
* fix typo
* improve docstring

---------

Co-authored-by: Michael McCrackan <mmccrack@login40.chn.perlmutter.nersc.gov>
Co-authored-by: Michael McCrackan <mmccrack@login15.chn.perlmutter.nersc.gov>
1 parent 6f04c1c commit 86d99c2

3 files changed (+144 -109 lines)
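
The change that threads through all three files is that pp_util.get_groups now returns a third element carrying error information instead of letting a det_match/metadata failure raise inside the callers. A minimal sketch of the new calling convention, assuming only what the diffs below show (obs_id, configs, and context stand for objects the caller already has):

    # Sketch of the new get_groups contract, mirroring the ValueError handling
    # added to multilayer_load_and_preprocess in this commit.
    from sotodlib.preprocess import preprocess_util as pp_util

    group_by, groups, error = pp_util.get_groups(obs_id, configs, context)
    if error is not None:
        # error is [summary, 'ExceptionType: message', formatted traceback]
        raise ValueError(f"{error[0]}\n{error[1]}\n{error[2]}")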

sotodlib/preprocess/preprocess_util.py

Lines changed: 83 additions & 22 deletions
@@ -176,19 +176,25 @@ def get_groups(obs_id, configs, context):
     groups : list of list of int
         The list of groups of detectors.
     """
-    group_by = np.atleast_1d(configs['subobs'].get('use', 'detset'))
-    for i, gb in enumerate(group_by):
-        if gb.startswith('dets:'):
-            group_by[i] = gb.split(':',1)[1]
-
-        if (gb == 'detset') and (len(group_by) == 1):
-            groups = context.obsfiledb.get_detsets(obs_id)
-            return group_by, [[g] for g in groups]
-
-    det_info = context.get_det_info(obs_id)
-    rs = det_info.subset(keys=group_by).distinct()
-    groups = [[b for a,b in r.items()] for r in rs]
-    return group_by, groups
+    try:
+        group_by = np.atleast_1d(configs['subobs'].get('use', 'detset'))
+        for i, gb in enumerate(group_by):
+            if gb.startswith('dets:'):
+                group_by[i] = gb.split(':',1)[1]
+
+            if (gb == 'detset') and (len(group_by) == 1):
+                groups = context.obsfiledb.get_detsets(obs_id)
+                return group_by, [[g] for g in groups], None
+
+        det_info = context.get_det_info(obs_id)
+        rs = det_info.subset(keys=group_by).distinct()
+        groups = [[b for a,b in r.items()] for r in rs]
+        return group_by, groups, None
+    except Exception as e:
+        error = f'Failed get groups for: {obs_id}'
+        errmsg = f'{type(e)}: {e}'
+        tb = ''.join(traceback.format_tb(e.__traceback__))
+        return [], [], [error, errmsg, tb]
 
 
 def get_preprocess_db(configs, group_by, logger=None):
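
The except branch above packs the failure into a three-element list: a short summary, the exception type and message, and the formatted traceback. A small standalone illustration of the same pattern (not part of the commit; safe_call is a made-up helper):

    import traceback

    def safe_call(fn, *args):
        # Return (result, None) on success, or (None, [summary, message, traceback])
        # on failure -- the same triple shape get_groups now returns.
        try:
            return fn(*args), None
        except Exception as e:
            tb = ''.join(traceback.format_tb(e.__traceback__))
            return None, [f'Failed {fn.__name__}', f'{type(e)}: {e}', tb]

    result, error = safe_call(int, 'not-a-number')
    assert result is None and error[0] == 'Failed int'
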
@@ -388,8 +394,14 @@ def multilayer_load_and_preprocess(obs_id, configs_init, configs_proc,
     configs_proc, context_proc = get_preprocess_context(configs_proc, context_proc)
     meta_proc = context_proc.get_meta(obs_id, dets=dets, meta=meta)
 
-    group_by_init, groups_init = get_groups(obs_id, configs_init, context_init)
-    group_by_proc, groups_proc = get_groups(obs_id, configs_proc, context_proc)
+    group_by_init, groups_init, error_init = get_groups(obs_id, configs_init, context_init)
+    group_by_proc, groups_proc, error_proc = get_groups(obs_id, configs_proc, context_proc)
+
+    if error_init is not None:
+        raise ValueError(f"{error_init[0]}\n{error_init[1]}\n{error_init[2]}")
+
+    if error_proc is not None:
+        raise ValueError(f"{error_proc[0]}\n{error_proc[1]}\n{error_proc[2]}")
 
     if (group_by_init != group_by_proc).any():
         raise ValueError('init and proc groups do not match')
@@ -451,7 +463,7 @@ def find_db(obs_id, configs, dets, context=None, logger=None):
         configs = yaml.safe_load(open(configs, "r"))
     if context is None:
         context = core.Context(configs["context_file"])
-    group_by, _ = get_groups(obs_id, configs, context)
+    group_by, _, _ = get_groups(obs_id, configs, context)
     cur_groups = [list(np.fromiter(dets.values(), dtype='<U32'))]
     dbexist = True
     if os.path.exists(configs['archive']['index']):
@@ -560,7 +572,7 @@ def save_group_and_cleanup(obs_id, configs, context=None, subdir='temp',
     if context is None:
         context = core.Context(configs["context_file"])
 
-    group_by, groups = get_groups(obs_id, configs, context)
+    group_by, groups, error = get_groups(obs_id, configs, context)
 
     all_groups = groups.copy()
     for g in all_groups:
@@ -583,6 +595,49 @@ def save_group_and_cleanup(obs_id, configs, context=None, subdir='temp',
             except OSError as e:
                 # remove if it can't be opened
                 os.remove(outputs_grp['temp_file'])
+    return error
+
+
+def cleanup_obs(obs_id, policy_dir, errlog, configs, context=None,
+                subdir='temp', remove=False):
+    """
+    For a given obs id, this function will search the policy_dir directory
+    if it exists for any files with that obsnum in their filename. If any are
+    found, it will run save_group_and_cleanup for that obs id.
+
+    Arguments
+    ---------
+    obs_id: str
+        Obs id to check and clean up
+    policy_dir: str
+        Directory to temp per-group output files
+    errlog: fpath
+        Filepath to error logging file.
+    configs: fpath or dict
+        Filepath or dictionary containing the preprocess configuration file.
+    context: core.Context
+        Optional. Context object used for data loading/querying.
+    subdir: str
+        Optional. Subdirectory to save the output files into.
+    remove: bool
+        Optional. Default is False. Whether to remove a file if found.
+        Used when ``overwrite`` is True in driving functions.
+    """
+
+    if os.path.exists(policy_dir):
+        found = False
+        for f in os.listdir(policy_dir):
+            if obs_id in f:
+                found = True
+                break
+
+        if found:
+            error = save_group_and_cleanup(obs_id, configs, context,
+                                           subdir=subdir, remove=remove)
+            if error is not None:
+                f = open(errlog, 'a')
+                f.write(f'\n{time.time()}, cleanup error\n{error[0]}\n{error[2]}\n')
+                f.close()
 
 
 def preproc_or_load_group(obs_id, configs_init, dets, configs_proc=None, logger=None,
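
For orientation, the site_pipeline drivers (see the main() diffs further down) call this new helper once per observation before building the run list. A condensed sketch of that usage; the paths and obs id here are placeholders, and the pp_util alias matches the driver imports:

    import os
    from sotodlib.preprocess import preprocess_util as pp_util

    configs = 'preprocess_config.yaml'            # placeholder config path
    archive_file = 'archive/preproc_archive.h5'   # placeholder policy filename
    policy_dir = os.path.join(os.path.dirname(archive_file), 'temp')
    errlog = os.path.join(os.path.dirname(archive_file), 'errlog.txt')

    # Scans policy_dir for files containing the obs id and, if any are found,
    # runs save_group_and_cleanup, appending failures to errlog.
    pp_util.cleanup_obs('obs_0000000000_xxx_000', policy_dir, errlog,
                        configs, context=None, subdir='temp', remove=False)
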
@@ -657,9 +712,12 @@ def preproc_or_load_group(obs_id, configs_init, dets, configs_proc=None, logger=
         if context_proc is None:
             context_proc = core.Context(configs_proc["context_file"])
 
-        group_by, groups = get_groups(obs_id, configs_proc, context_proc)
+        group_by, groups, error = get_groups(obs_id, configs_proc, context_proc)
     else:
-        group_by, groups = get_groups(obs_id, configs_init, context_init)
+        group_by, groups, error = get_groups(obs_id, configs_init, context_init)
+
+    if error is not None:
+        return error[0], [error[1], error[2]], [error[1], error[2]], None
 
     all_groups = groups.copy()
     cur_groups = [list(np.fromiter(dets.values(), dtype='<U32'))]
@@ -674,11 +732,13 @@ def preproc_or_load_group(obs_id, configs_init, dets, configs_proc=None, logger=
         error = 'no_group_overlap'
         return error, [obs_id, dets], [obs_id, dets], None
 
-    db_init_exist = find_db(obs_id, configs_init, dets, context_init)
+    db_init_exist = find_db(obs_id, configs_init, dets, context_init,
+                            logger=logger)
 
     db_proc_exist = False
     if configs_proc is not None:
-        db_proc_exist = find_db(obs_id, configs_proc, dets, context_proc)
+        db_proc_exist = find_db(obs_id, configs_proc, dets, context_proc,
+                                logger=logger)
 
     if (not db_init_exist) and db_proc_exist and (not overwrite):
         logger.info('dependent db requires initial db if not overwriting')
@@ -882,7 +942,8 @@ def cleanup_mandb(error, outputs, configs, logger=None, overwrite=False):
         errlog = os.path.join(folder, 'errlog.txt')
         f = open(errlog, 'a')
         f.write(f'{time.time()}, {error}\n')
-        f.write(f'\t{outputs[0]}\n\t{outputs[1]}\n')
+        if outputs is not None:
+            f.write(f'\t{outputs[0]}\n\t{outputs[1]}\n')
         f.close()
 
 
sotodlib/site_pipeline/multilayer_preprocess_tod.py

Lines changed: 40 additions & 59 deletions
@@ -66,10 +66,30 @@ def multilayer_preprocess_tod(obs_id,
         configs_proc = yaml.safe_load(open(configs_proc, "r"))
     context_proc = core.Context(configs_proc["context_file"])
 
-    group_by_proc, groups_proc = pp_util.get_groups(obs_id, configs_proc, context_proc)
+    group_by_init, groups_init, error_init = pp_util.get_groups(obs_id, configs_init, context_init)
+    group_by_proc, groups_proc, error_proc = pp_util.get_groups(obs_id, configs_proc, context_proc)
+
+    if error_init is not None:
+        if run_parallel:
+            return error_init[0], [None, None], [None, None]
+        else:
+            return
+
+    if error_proc is not None:
+        if run_parallel:
+            return error_proc[0], [None, None], [None, None]
+        else:
+            return
+
+    if len(groups_init) > 0 and len(groups_proc) > 0:
+        if (group_by_init != group_by_proc).any():
+            raise ValueError('init and proc groups do not match')
 
     all_groups_proc = groups_proc.copy()
     for g in all_groups_proc:
+        if g not in groups_init:
+            groups_proc.remove(g)
+            continue
         if group_list is not None:
             if g not in group_list:
                 groups_proc.remove(g)
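
The three lines added inside the for loop drop any proc-layer group that has no counterpart in the init layer before any further filtering. A tiny standalone illustration of that filtering; the wafer-slot style values are invented for the example:

    # Invented example groups; real entries come from get_groups (e.g. one detset per group).
    groups_init = [['ws0'], ['ws1']]
    groups_proc = [['ws0'], ['ws1'], ['ws2']]

    for g in groups_proc.copy():      # iterate over a copy so removal is safe
        if g not in groups_init:
            groups_proc.remove(g)     # ['ws2'] has no init-layer counterpart

    assert groups_proc == [['ws0'], ['ws1']]
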
@@ -281,76 +301,37 @@ def main(configs_init: str,
         logger.warning(f"No observations returned from query: {query}")
 
     # clean up lingering files from previous incomplete runs
+    policy_dir_init = os.path.join(os.path.dirname(configs_init['archive']['policy']['filename']), 'temp')
+    policy_dir_proc = os.path.join(os.path.dirname(configs_proc['archive']['policy']['filename']), 'temp_proc')
     for obs in obs_list:
         obs_id = obs['obs_id']
-        pp_util.save_group_and_cleanup(obs_id, configs_init, context_init,
-                                       subdir='temp', remove=overwrite)
-        pp_util.save_group_and_cleanup(obs_id, configs_proc, context_proc,
-                                       subdir='temp_proc', remove=overwrite)
+        pp_util.cleanup_obs(obs_id, policy_dir_init, errlog, configs_init, context_init,
+                            subdir='temp', remove=overwrite)
+        pp_util.cleanup_obs(obs_id, policy_dir_proc, errlog, configs_proc, context_proc,
+                            subdir='temp_proc', remove=overwrite)
 
     run_list = []
 
     if overwrite or not os.path.exists(configs_proc['archive']['index']):
         # run on all if database doesn't exist
         for obs in obs_list:
-            group_by_init, groups_init = pp_util.get_groups(obs["obs_id"], configs_init, context_init)
-            group_by_proc, groups_proc = pp_util.get_groups(obs["obs_id"], configs_proc, context_proc)
-
-            if (group_by_init != group_by_proc).any():
-                raise ValueError('init and proc groups do not match')
-
-            all_groups_proc = groups_proc.copy()
-            for g in all_groups_proc:
-                if g not in groups_init:
-                    groups_proc.remove(g)
-
-            run_list.append( (obs, groups_proc) )
+            #run on all if database doesn't exist
+            run_list = [ (o,None) for o in obs_list]
+            group_by_proc = np.atleast_1d(configs_proc['subobs'].get('use', 'detset'))
     else:
         db = core.metadata.ManifestDb(configs_proc['archive']['index'])
         for obs in obs_list:
             x = db.inspect({'obs:obs_id': obs["obs_id"]})
-            group_by_init, groups_init = pp_util.get_groups(obs["obs_id"], configs_init, context_init)
-            group_by_proc, groups_proc = pp_util.get_groups(obs["obs_id"], configs_proc, context_proc)
-
-            if (group_by_init != group_by_proc).any():
-                raise ValueError('init and proc groups do not match')
-
-            all_groups_proc = groups_proc.copy()
-            for g in all_groups_proc:
-                if g not in groups_init:
-                    groups_proc.remove(g)
-
             if x is None or len(x) == 0:
-                run_list.append( (obs, groups_proc) )
-            elif len(x) != len(groups_proc):
-                [groups_proc.remove([a[f'dets:{gb}'] for gb in group_by_proc]) for a in x]
-                run_list.append( (obs, groups_proc) )
+                run_list.append( (obs, None) )
+            else:
+                group_by_proc, groups_proc, _ = pp_util.get_groups(obs["obs_id"], configs_proc, context_proc)
+                if len(x) != len(groups_proc):
+                    [groups_proc.remove([a[f'dets:{gb}'] for gb in group_by_proc]) for a in x]
+                    run_list.append( (obs, groups_proc) )
 
     logger.info(f'Run list created with {len(run_list)} obsids')
 
-    # Expects archive policy filename to be <path>/<filename>.h5 and then this adds
-    # <path>/<filename>_<xxx>.h5 where xxx is a number that increments up from 0
-    # whenever the file size exceeds 10 GB.
-    nfile_init = 0
-    folder_init = os.path.dirname(configs_init['archive']['policy']['filename'])
-    basename_init = os.path.splitext(configs_init['archive']['policy']['filename'])[0]
-    dest_file_init = basename_init + '_' + str(nfile_init).zfill(3) + '.h5'
-    if not(os.path.exists(folder_init)):
-        os.makedirs(folder_init)
-    while os.path.exists(dest_file_init) and os.path.getsize(dest_file_init) > 10e9:
-        nfile_init += 1
-        dest_file_init = basename_init + '_' + str(nfile_init).zfill(3) + '.h5'
-
-    nfile_proc = 0
-    folder_proc = os.path.dirname(configs_proc['archive']['policy']['filename'])
-    basename_proc = os.path.splitext(configs_proc['archive']['policy']['filename'])[0]
-    dest_file_proc = basename_proc + '_' + str(nfile_proc).zfill(3) + '.h5'
-    if not(os.path.exists(folder_proc)):
-        os.makedirs(folder_proc)
-    while os.path.exists(dest_file_proc) and os.path.getsize(dest_file_proc) > 10e9:
-        nfile_proc += 1
-        dest_file_proc = basename_proc + '_' + str(nfile_proc).zfill(3) + '.h5'
-
     # run write_block obs-ids in parallel at once then write all to the sqlite db.
     with ProcessPoolExecutor(nproc) as exe:
         futures = [exe.submit(multilayer_preprocess_tod, obs_id=r[0]['obs_id'],
@@ -372,12 +353,12 @@ def main(configs_init: str,
                 continue
             futures.remove(future)
 
-            if err is None:
+            if db_datasets_init:
                 logger.info(f'Processing future result db_dataset: {db_datasets_init}')
-                if db_datasets_init:
-                    for db_dataset in db_datasets_init:
-                        pp_util.cleanup_mandb(err, db_dataset, configs_init, logger, overwrite)
+                for db_dataset in db_datasets_init:
+                    pp_util.cleanup_mandb(err, db_dataset, configs_init, logger, overwrite)
 
+            if db_datasets_proc:
                 logger.info(f'Processing future dependent result db_dataset: {db_datasets_proc}')
                 for db_dataset in db_datasets_proc:
                     pp_util.cleanup_mandb(err, db_dataset, configs_proc, logger, overwrite)

sotodlib/site_pipeline/preprocess_tod.py

Lines changed: 21 additions & 28 deletions
@@ -29,7 +29,7 @@ def dummy_preproc(obs_id, group_list, logger,
     error = None
     outputs = []
     context = core.Context(configs["context_file"])
-    group_by, groups = pp_util.get_groups(obs_id, configs, context)
+    group_by, groups, error = pp_util.get_groups(obs_id, configs, context)
     pipe = Pipeline(configs["process_pipe"], plot_dir=configs["plot_dir"], logger=logger)
     for group in groups:
         logger.info(f"Beginning run for {obs_id}:{group}")
@@ -83,7 +83,14 @@ def preprocess_tod(obs_id,
         configs = yaml.safe_load(open(configs, "r"))
 
     context = core.Context(configs["context_file"])
-    group_by, groups = pp_util.get_groups(obs_id, configs, context)
+    group_by, groups, error = pp_util.get_groups(obs_id, configs, context)
+
+    if error is not None:
+        if run_parallel:
+            return error[0], [None, None]
+        else:
+            return
+
     all_groups = groups.copy()
     for g in all_groups:
         if group_list is not None:
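
The early return added above has two shapes: with run_parallel=True it reports the failure back to the executor as (error_summary, [None, None]), and in serial mode it simply returns None so the per-obs loop moves on. A toy stand-in for that contract (not the real function):

    # Toy stand-in for the early exit added to preprocess_tod above.
    def early_exit(error, run_parallel):
        if error is not None:
            if run_parallel:
                return error[0], [None, None]
            else:
                return

    assert early_exit(['failed', 'msg', 'tb'], run_parallel=True) == ('failed', [None, None])
    assert early_exit(['failed', 'msg', 'tb'], run_parallel=False) is None
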
@@ -325,46 +332,32 @@ def main(
         logger.warning(f"No observations returned from query: {query}")
 
     # clean up lingering files from previous incomplete runs
+    policy_dir = os.path.join(os.path.dirname(configs['archive']['policy']['filename']), 'temp')
     for obs in obs_list:
         obs_id = obs['obs_id']
-        pp_util.save_group_and_cleanup(obs_id, configs, context,
-                                       subdir='temp', remove=overwrite)
+        pp_util.cleanup_obs(obs_id, policy_dir, errlog, configs, context,
+                            subdir='temp', remove=overwrite)
 
     run_list = []
 
     if overwrite or not os.path.exists(configs['archive']['index']):
         #run on all if database doesn't exist
-        for obs in obs_list:
-            group_by, groups = pp_util.get_groups(obs["obs_id"], configs, context)
-            run_list.append( (obs, groups) )# = [ (o, groups) for o in obs_list]
+        run_list = [ (o,None) for o in obs_list]
+        group_by = np.atleast_1d(configs['subobs'].get('use', 'detset'))
     else:
         db = core.metadata.ManifestDb(configs['archive']['index'])
         for obs in obs_list:
             x = db.inspect({'obs:obs_id': obs["obs_id"]})
-            group_by, groups = pp_util.get_groups(obs["obs_id"], configs, context)
             if x is None or len(x) == 0:
                 run_list.append( (obs, None) )
-            elif len(x) != len(groups):
-                [groups.remove([a[f'dets:{gb}'] for gb in group_by]) for a in x]
-                run_list.append( (obs, groups) )
+            else:
+                group_by, groups, _ = pp_util.get_groups(obs["obs_id"], configs, context)
+                if len(x) != len(groups):
+                    [groups.remove([a[f'dets:{gb}'] for gb in group_by]) for a in x]
+                    run_list.append( (obs, groups) )
 
     logger.info(f'Run list created with {len(run_list)} obsids')
 
-    # Expects archive policy filename to be <path>/<filename>.h5 and then this adds
-    # <path>/<filename>_<xxx>.h5 where xxx is a number that increments up from 0
-    # whenever the file size exceeds 10 GB.
-    nfile = 0
-    folder = os.path.dirname(configs['archive']['policy']['filename'])
-    basename = os.path.splitext(configs['archive']['policy']['filename'])[0]
-    dest_file = basename + '_' + str(nfile).zfill(3) + '.h5'
-    if not(os.path.exists(folder)):
-        os.makedirs(folder)
-    while os.path.exists(dest_file) and os.path.getsize(dest_file) > 10e9:
-        nfile += 1
-        dest_file = basename + '_' + str(nfile).zfill(3) + '.h5'
-
-    logger.info(f'Starting dest_file set to {dest_file}')
-
     # Run write_block obs-ids in parallel at once then write all to the sqlite db.
     with ProcessPoolExecutor(nproc) as exe:
         futures = [exe.submit(preprocess_tod, obs_id=r[0]['obs_id'],
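
In the rebuilt run_list logic above, a second element of None means "process every group for this obs"; group enumeration is deferred to the worker, which only prunes groups when an explicit group_list is given (see the group_list check in preprocess_tod earlier in this diff). A compact illustration of the convention with toy entries:

    # Toy run_list entries following the (obs, group_list) convention above.
    run_list = [({'obs_id': 'obs_a'}, None),        # None -> worker runs all groups
                ({'obs_id': 'obs_b'}, [['ws0']])]   # explicit subset (invented value)

    for obs, group_list in run_list:
        todo = 'all groups' if group_list is None else f'{len(group_list)} group(s)'
        print(obs['obs_id'], '->', todo)
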
@@ -385,8 +378,8 @@ def main(
                 continue
             futures.remove(future)
 
-            logger.info(f'Processing future result db_dataset: {db_datasets}')
-            if err is None and db_datasets:
+            if db_datasets:
+                logger.info(f'Processing future result db_dataset: {db_datasets}')
                 for db_dataset in db_datasets:
                     pp_util.cleanup_mandb(err, db_dataset, configs, logger)
 
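
Taken together with the preprocess_util.py change earlier in this commit, the drivers now hand results to cleanup_mandb even when a worker reported an error, and the added `if outputs is not None` guard keeps the error-log write from indexing into missing outputs. A minimal sketch of that guarded append; log_error and the path are illustrative, not part of the commit:

    import time

    def log_error(errlog, error, outputs):
        # Mirrors the guarded errlog write in cleanup_mandb: only index into
        # outputs when the worker actually produced them.
        with open(errlog, 'a') as f:
            f.write(f'{time.time()}, {error}\n')
            if outputs is not None:
                f.write(f'\t{outputs[0]}\n\t{outputs[1]}\n')

    log_error('errlog.txt', 'no_group_overlap', None)   # placeholder path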