Skip to content

Commit

Permalink
TRON-1850: Include starting pods in check for stuck jobs/actions
Browse files Browse the repository at this point in the history
  • Loading branch information
jfongatyelp committed Feb 13, 2024
1 parent c0d3a0a commit c4a7d8c
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 6 deletions.
125 changes: 124 additions & 1 deletion tests/bin/check_tron_jobs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@ def test_job_next_run_starting_overlap_allowed_not_stuck(self):
assert_equal(run["id"], "MASTER.test.1")
assert_equal(state, State.SUCCEEDED)

def test_job_next_run_starting_no_queueing_not_stuck(self):
def test_job_next_run_running_no_queueing_not_stuck(self):
job_runs = {
"status": "running",
"next_run": None,
Expand Down Expand Up @@ -549,6 +549,49 @@ def test_job_next_run_starting_no_queueing_not_stuck(self):
assert_equal(run["id"], "MASTER.test.1")
assert_equal(state, State.SUCCEEDED)

def test_job_next_run_starting_no_queueing_not_stuck(self):
    """A run sitting in "starting" must not flag a non-queueing job as stuck.

    The job has overlap and queueing disabled, a cancelled newest run, a
    run still in "starting", and an older succeeded run. The check is
    expected to report the succeeded run (MASTER.test.1) with
    State.SUCCEEDED.
    """

    def stamp(seconds_ago):
        # Timestamp format used throughout these job-run fixtures.
        return time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(time.time() - seconds_ago),
        )

    # Runs are listed newest first.
    cancelled_run = {
        "id": "MASTER.test.3",
        "state": "cancelled",
        "run_time": stamp(600),
        "end_time": None,
    }
    starting_run = {
        "id": "MASTER.test.2",
        "state": "starting",
        "run_time": stamp(1200),
        "end_time": None,
    }
    succeeded_run = {
        "id": "MASTER.test.1",
        "state": "succeeded",
        "run_time": stamp(1800),
        "end_time": stamp(1700),
    }
    job_runs = {
        "status": "starting",
        "next_run": None,
        "allow_overlap": False,
        "queueing": False,
        "runs": [cancelled_run, starting_run, succeeded_run],
    }

    relevant_run, relevant_state = check_tron_jobs.get_relevant_run_and_state(job_runs)

    assert_equal(relevant_run["id"], "MASTER.test.1")
    assert_equal(relevant_state, State.SUCCEEDED)

def test_job_running_job_exceeds_expected_runtime(self):
job_runs = {
"status": "running",
Expand Down Expand Up @@ -587,6 +630,44 @@ def test_job_running_job_exceeds_expected_runtime(self):
assert_equal(run["id"], "MASTER.test.99")
assert_equal(state, State.STUCK)

def test_job_starting_job_exceeds_expected_runtime(self):
    """A "starting" run that outlasts the expected runtime is reported stuck.

    MASTER.test.99 is still in "starting" with a duration just over ten
    minutes, while the job's expected runtime is 480 seconds. The check is
    expected to return that run together with State.STUCK.
    """

    def stamp(offset_seconds):
        # Positive offsets are in the future, negative ones in the past.
        return time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(time.time() + offset_seconds),
        )

    upcoming_run = {
        "id": "MASTER.test.100",
        "state": "scheduled",
        "run_time": stamp(600),
        "end_time": None,
        "start_time": None,
        "duration": "",
    }
    starting_run = {
        "id": "MASTER.test.99",
        "state": "starting",
        "run_time": stamp(-600),
        "start_time": stamp(-600),
        "end_time": None,
        "duration": "0:10:01.883601",
    }
    job_runs = {
        "status": "running",
        "next_run": None,
        "expected_runtime": 480.0,
        "allow_overlap": True,
        "runs": [upcoming_run, starting_run],
    }

    relevant_run, relevant_state = check_tron_jobs.get_relevant_run_and_state(job_runs)

    assert_equal(relevant_run["id"], "MASTER.test.99")
    assert_equal(relevant_state, State.STUCK)

def test_job_waiting_job_exceeds_expected_runtime_already_started(self):
job_runs = {
"status": "running",
Expand Down Expand Up @@ -854,6 +935,48 @@ def test_get_relevant_action_pick_the_one_exceeds_expected_runtime(self):
)
assert_equal(actual["id"], "MASTER.test.1.action3")

def test_get_relevant_action_pick_the_one_starting(self):
    """For a stuck job, the action still in "starting" is the relevant one.

    Three actions started ten minutes ago: one "starting", one "running"
    and one "succeeded". With last_state=State.STUCK the lookup is expected
    to return the "starting" action (MASTER.test.1.action3).
    """
    # NOTE(review): these fixtures carry no "action_name" key, so the
    # per-action expected-runtime lookup presumably never matches here;
    # confirm the selection is exercised through the intended path.
    fixture_rows = (
        ("MASTER.test.1.action3", "starting", "0:10:00.006305"),
        ("MASTER.test.1.action2", "running", "0:10:00.006383"),
        ("MASTER.test.1.action1", "succeeded", "0:10:00.006331"),
    )
    action_runs = [
        {
            "id": run_id,
            "state": run_state,
            "start_time": time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(time.time() - 600),
            ),
            "duration": run_duration,
        }
        for run_id, run_state, run_duration in fixture_rows
    ]
    actions_expected_runtime = {
        "action3": 480.0,
        "action2": 720.0,
        "action1": 900.0,
    }

    picked = check_tron_jobs.get_relevant_action(
        action_runs=action_runs,
        last_state=State.STUCK,
        actions_expected_runtime=actions_expected_runtime,
    )

    assert_equal(picked["id"], "MASTER.test.1.action3")

def test_get_relevant_action_pick_the_one_exceeds_expected_runtime_with_long_duration(
self,
):
Expand Down
11 changes: 6 additions & 5 deletions tron/bin/check_tron_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def get_relevant_run_and_state(job_content):
state = run.get("state", "unknown")
if state in ["failed", "succeeded", "unknown", "skipped"]:
return run, State(state)
elif state in ["running", "waiting"]:
elif state in ["running", "waiting", "starting"]:
action_state = is_action_failed_or_unknown(run)
if action_state != State.SUCCEEDED:
return run, action_state
Expand All @@ -251,7 +251,7 @@ def is_job_stuck(
):
next_run_time = None
for job_run in job_runs:
states_to_check = {"running", "waiting"}
states_to_check = {"running", "waiting", "starting"}
if job_run.get("state", "unknown") in states_to_check:
if is_job_run_exceeding_expected_runtime(
job_run,
Expand All @@ -275,7 +275,7 @@ def is_job_stuck(


def is_job_run_exceeding_expected_runtime(job_run, job_expected_runtime):
states_to_check = {"running", "waiting"}
states_to_check = {"running", "waiting", "starting"}
if (
job_expected_runtime is not None
and job_run.get(
Expand All @@ -294,7 +294,8 @@ def is_action_run_exceeding_expected_runtime(
action_run,
actions_expected_runtime,
):
if action_run.get("state", "unknown") == "running":
states_to_check = ["running", "starting"]
if action_run.get("state", "unknown") in states_to_check:
action_name = action_run.get("action_name", None)
if action_name in actions_expected_runtime and actions_expected_runtime[action_name] is not None:
duration_seconds = pytimeparse.parse(
Expand All @@ -319,7 +320,7 @@ def get_relevant_action(*, action_runs, last_state, actions_expected_runtime):
actions_expected_runtime,
):
return action_run
if action_state == "running":
if action_state == "running" or action_state == "starting":
stuck_action_run_candidate = action_run
return stuck_action_run_candidate or action_runs[-1]

Expand Down

0 comments on commit c4a7d8c

Please sign in to comment.