Skip to content

Commit

Permalink
TRON-1850: Include starting pods in check for stuck jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
jfongatyelp committed Feb 13, 2024
1 parent c0d3a0a commit 041fac4
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 5 deletions.
38 changes: 38 additions & 0 deletions tests/bin/check_tron_jobs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,44 @@ def test_job_running_job_exceeds_expected_runtime(self):
assert_equal(run["id"], "MASTER.test.99")
assert_equal(state, State.STUCK)

def test_job_starting_job_exceeds_expected_runtime(self):
    """A run still in "starting" past the job's expected_runtime is reported as STUCK.

    The job payload holds a future "scheduled" run and a "starting" run
    whose start_time is 600 seconds in the past, against an
    expected_runtime of 480.0 seconds. The "starting" run must be the one
    returned, with state State.STUCK.
    """

    def timestamp(offset_seconds):
        # Local wall-clock time shifted by offset_seconds, rendered in the
        # "%Y-%m-%d %H:%M:%S" format used by the run entries below.
        return time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(time.time() + offset_seconds),
        )

    # Run that is scheduled 600 seconds from now and has not started.
    upcoming_run = {
        "id": "MASTER.test.100",
        "state": "scheduled",
        "run_time": timestamp(600),
        "end_time": None,
        "start_time": None,
        "duration": "",
    }
    # Run that began 600 seconds ago and is still in the "starting" state.
    starting_run = {
        "id": "MASTER.test.99",
        "state": "starting",
        "run_time": timestamp(-600),
        "start_time": timestamp(-600),
        "end_time": None,
        "duration": "0:10:01.883601",
    }
    job_runs = {
        "status": "running",
        "next_run": None,
        "expected_runtime": 480.0,
        "allow_overlap": True,
        "runs": [upcoming_run, starting_run],
    }

    relevant_run, relevant_state = check_tron_jobs.get_relevant_run_and_state(job_runs)
    assert_equal(relevant_run["id"], "MASTER.test.99")
    assert_equal(relevant_state, State.STUCK)

def test_job_waiting_job_exceeds_expected_runtime_already_started(self):
job_runs = {
"status": "running",
Expand Down
11 changes: 6 additions & 5 deletions tron/bin/check_tron_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def get_relevant_run_and_state(job_content):
state = run.get("state", "unknown")
if state in ["failed", "succeeded", "unknown", "skipped"]:
return run, State(state)
elif state in ["running", "waiting"]:
elif state in ["running", "waiting", "starting"]:
action_state = is_action_failed_or_unknown(run)
if action_state != State.SUCCEEDED:
return run, action_state
Expand All @@ -251,7 +251,7 @@ def is_job_stuck(
):
next_run_time = None
for job_run in job_runs:
states_to_check = {"running", "waiting"}
states_to_check = {"running", "waiting", "starting"}
if job_run.get("state", "unknown") in states_to_check:
if is_job_run_exceeding_expected_runtime(
job_run,
Expand All @@ -275,7 +275,7 @@ def is_job_stuck(


def is_job_run_exceeding_expected_runtime(job_run, job_expected_runtime):
states_to_check = {"running", "waiting"}
states_to_check = {"running", "waiting", "starting"}
if (
job_expected_runtime is not None
and job_run.get(
Expand All @@ -294,7 +294,8 @@ def is_action_run_exceeding_expected_runtime(
action_run,
actions_expected_runtime,
):
if action_run.get("state", "unknown") == "running":
states_to_check = ["running", "starting"]
if action_run.get("state", "unknown") in states_to_check:
action_name = action_run.get("action_name", None)
if action_name in actions_expected_runtime and actions_expected_runtime[action_name] is not None:
duration_seconds = pytimeparse.parse(
Expand All @@ -319,7 +320,7 @@ def get_relevant_action(*, action_runs, last_state, actions_expected_runtime):
actions_expected_runtime,
):
return action_run
if action_state == "running":
if action_state == "running" or action_state == "starting":
stuck_action_run_candidate = action_run
return stuck_action_run_candidate or action_runs[-1]

Expand Down

0 comments on commit 041fac4

Please sign in to comment.