diff --git a/scripts/node_cleanup.py b/scripts/node_cleanup.py index c260df8a6..abac7b96e 100755 --- a/scripts/node_cleanup.py +++ b/scripts/node_cleanup.py @@ -10,7 +10,11 @@ def main(): if args.verbose: teuthology.log.setLevel(logging.DEBUG) log = logging.getLogger(__name__) - stale = query.find_stale_locks(args.owner) + try: + stale = query.find_stale_locks(args.owner) + except Exception: + log.exception(f"Error while checking for stale locks held by {args.owner}") + return if not stale: return by_owner = {} @@ -30,6 +34,7 @@ def main(): else: for owner, nodes in by_owner.items(): ops.unlock_safe([node["name"] for node in nodes], owner) + log.info(f"unlocked {len(stale)} nodes") def parse_args(argv): parser = argparse.ArgumentParser( diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py index 83e6d997c..273f9e2ac 100644 --- a/teuthology/dispatcher/supervisor.py +++ b/teuthology/dispatcher/supervisor.py @@ -226,6 +226,7 @@ def reimage(job_config): reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type']) except Exception as e: log.exception('Reimaging error. Nuking machines...') + unlock_targets(job_config) # Reimage failures should map to the 'dead' status instead of 'fail' report.try_push_job_info( ctx.config, diff --git a/teuthology/lock/query.py b/teuthology/lock/query.py index 752eac106..d52149a21 100644 --- a/teuthology/lock/query.py +++ b/teuthology/lock/query.py @@ -160,5 +160,9 @@ def node_active_job(name: str, status: Union[dict, None] = None) -> Union[str, N if resp.ok: job_status = resp.json()["status"] break + elif resp.status_code == 404: + break + else: + log.debug(f"Error {resp.status_code} listing job {run_name}/{job_id} for {name}: {resp.text}") if job_status and job_status not in ('pass', 'fail', 'dead'): return description