From c4a7d8c42b6c1f8f902d93de0b7f9872f75d8a11 Mon Sep 17 00:00:00 2001 From: Jen Patague Date: Tue, 13 Feb 2024 12:32:28 -0800 Subject: [PATCH] TRON-1850: Include starting pods in check for stuck jobs/actions --- tests/bin/check_tron_jobs_test.py | 125 +++++++++++++++++++++++++++++- tron/bin/check_tron_jobs.py | 11 +-- 2 files changed, 130 insertions(+), 6 deletions(-) diff --git a/tests/bin/check_tron_jobs_test.py b/tests/bin/check_tron_jobs_test.py index 2d905eb24..e648f9a59 100644 --- a/tests/bin/check_tron_jobs_test.py +++ b/tests/bin/check_tron_jobs_test.py @@ -506,7 +506,7 @@ def test_job_next_run_starting_overlap_allowed_not_stuck(self): assert_equal(run["id"], "MASTER.test.1") assert_equal(state, State.SUCCEEDED) - def test_job_next_run_starting_no_queueing_not_stuck(self): + def test_job_next_run_running_no_queueing_not_stuck(self): job_runs = { "status": "running", "next_run": None, @@ -549,6 +549,49 @@ def test_job_next_run_starting_no_queueing_not_stuck(self): assert_equal(run["id"], "MASTER.test.1") assert_equal(state, State.SUCCEEDED) + def test_job_next_run_starting_no_queueing_not_stuck(self): + job_runs = { + "status": "starting", + "next_run": None, + "allow_overlap": False, + "queueing": False, + "runs": [ + { + "id": "MASTER.test.3", + "state": "cancelled", + "run_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 600), + ), + "end_time": None, + }, + { + "id": "MASTER.test.2", + "state": "starting", + "run_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 1200), + ), + "end_time": None, + }, + { + "id": "MASTER.test.1", + "state": "succeeded", + "run_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 1800), + ), + "end_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 1700), + ), + }, + ], + } + run, state = check_tron_jobs.get_relevant_run_and_state(job_runs) + assert_equal(run["id"], "MASTER.test.1") + assert_equal(state, State.SUCCEEDED) + def test_job_running_job_exceeds_expected_runtime(self): job_runs = { "status": "running", @@ -587,6 +630,44 @@ def test_job_running_job_exceeds_expected_runtime(self): assert_equal(run["id"], "MASTER.test.99") assert_equal(state, State.STUCK) + def test_job_starting_job_exceeds_expected_runtime(self): + job_runs = { + "status": "running", + "next_run": None, + "expected_runtime": 480.0, + "allow_overlap": True, + "runs": [ + { + "id": "MASTER.test.100", + "state": "scheduled", + "run_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() + 600), + ), + "end_time": None, + "start_time": None, + "duration": "", + }, + { + "id": "MASTER.test.99", + "state": "starting", + "run_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 600), + ), + "start_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 600), + ), + "end_time": None, + "duration": "0:10:01.883601", + }, + ], + } + run, state = check_tron_jobs.get_relevant_run_and_state(job_runs) + assert_equal(run["id"], "MASTER.test.99") + assert_equal(state, State.STUCK) + def test_job_waiting_job_exceeds_expected_runtime_already_started(self): job_runs = { "status": "running", @@ -854,6 +935,48 @@ def test_get_relevant_action_pick_the_one_exceeds_expected_runtime(self): ) assert_equal(actual["id"], "MASTER.test.1.action3") + def test_get_relevant_action_pick_the_one_starting(self): + action_runs = [ + { + "id": "MASTER.test.1.action3", + "state": "starting", + "start_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 600), + ), + "duration": "0:10:00.006305", + }, + { + "id": "MASTER.test.1.action2", + "state": "running", + "start_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 600), + ), + "duration": "0:10:00.006383", + }, + { + "id": "MASTER.test.1.action1", + "state": "succeeded", + "start_time": time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(time.time() - 600), + ), + "duration": "0:10:00.006331", + }, + ] + actions_expected_runtime = { + "action3": 480.0, + "action2": 720.0, + "action1": 900.0, + } + actual = check_tron_jobs.get_relevant_action( + action_runs=action_runs, + last_state=State.STUCK, + actions_expected_runtime=actions_expected_runtime, + ) + assert_equal(actual["id"], "MASTER.test.1.action3") + def test_get_relevant_action_pick_the_one_exceeds_expected_runtime_with_long_duration( self, ): diff --git a/tron/bin/check_tron_jobs.py b/tron/bin/check_tron_jobs.py index cd733d473..f2f60f757 100755 --- a/tron/bin/check_tron_jobs.py +++ b/tron/bin/check_tron_jobs.py @@ -226,7 +226,7 @@ def get_relevant_run_and_state(job_content): state = run.get("state", "unknown") if state in ["failed", "succeeded", "unknown", "skipped"]: return run, State(state) - elif state in ["running", "waiting"]: + elif state in ["running", "waiting", "starting"]: action_state = is_action_failed_or_unknown(run) if action_state != State.SUCCEEDED: return run, action_state @@ -251,7 +251,7 @@ def is_job_stuck( ): next_run_time = None for job_run in job_runs: - states_to_check = {"running", "waiting"} + states_to_check = {"running", "waiting", "starting"} if job_run.get("state", "unknown") in states_to_check: if is_job_run_exceeding_expected_runtime( job_run, @@ -275,7 +275,7 @@ def is_job_stuck( def is_job_run_exceeding_expected_runtime(job_run, job_expected_runtime): - states_to_check = {"running", "waiting"} + states_to_check = {"running", "waiting", "starting"} if ( job_expected_runtime is not None and job_run.get( @@ -294,7 +294,8 @@ def is_action_run_exceeding_expected_runtime( action_run, actions_expected_runtime, ): - if action_run.get("state", "unknown") == "running": + states_to_check = ["running", "starting"] + if action_run.get("state", "unknown") in states_to_check: action_name = action_run.get("action_name", None) if action_name in actions_expected_runtime and actions_expected_runtime[action_name] is not None: duration_seconds = pytimeparse.parse( @@ -319,7 +320,7 @@ def get_relevant_action(*, action_runs, last_state, actions_expected_runtime): actions_expected_runtime, ): return action_run - if action_state == "running": + if action_state == "running" or action_state == "starting": stuck_action_run_candidate = action_run return stuck_action_run_candidate or action_runs[-1]