TRON-1850: Include starting pods in check for stuck jobs/actions

Yelp · Feb 13, 2024
commit c4a7d8c · 1 parent c0d3a0a

Showing 2 changed files with 130 additions and 6 deletions.
diff --git a/tests/bin/check_tron_jobs_test.py b/tests/bin/check_tron_jobs_test.py
@@ -506,7 +506,7 @@ def test_job_next_run_starting_overlap_allowed_not_stuck(self):
         assert_equal(run["id"], "MASTER.test.1")
         assert_equal(state, State.SUCCEEDED)
 
-    def test_job_next_run_starting_no_queueing_not_stuck(self):
+    def test_job_next_run_running_no_queueing_not_stuck(self):
         job_runs = {
             "status": "running",
             "next_run": None,
@@ -549,6 +549,49 @@ def test_job_next_run_starting_no_queueing_not_stuck(self):
         assert_equal(run["id"], "MASTER.test.1")
         assert_equal(state, State.SUCCEEDED)
 
+    def test_job_next_run_starting_no_queueing_not_stuck(self):
+        job_runs = {
+            "status": "starting",
+            "next_run": None,
+            "allow_overlap": False,
+            "queueing": False,
+            "runs": [
+                {
+                    "id": "MASTER.test.3",
+                    "state": "cancelled",
+                    "run_time": time.strftime(
+                        "%Y-%m-%d %H:%M:%S",
+                        time.localtime(time.time() - 600),
+                    ),
+                    "end_time": None,
+                },
+                {
+                    "id": "MASTER.test.2",
+                    "state": "starting",
+                    "run_time": time.strftime(
+                        "%Y-%m-%d %H:%M:%S",
+                        time.localtime(time.time() - 1200),
+                    ),
+                    "end_time": None,
+                },
+                {
+                    "id": "MASTER.test.1",
+                    "state": "succeeded",
+                    "run_time": time.strftime(
+                        "%Y-%m-%d %H:%M:%S",
+                        time.localtime(time.time() - 1800),
+                    ),
+                    "end_time": time.strftime(
+                        "%Y-%m-%d %H:%M:%S",
+                        time.localtime(time.time() - 1700),
+                    ),
+                },
+            ],
+        }
+        run, state = check_tron_jobs.get_relevant_run_and_state(job_runs)
+        assert_equal(run["id"], "MASTER.test.1")
+        assert_equal(state, State.SUCCEEDED)
+
     def test_job_running_job_exceeds_expected_runtime(self):
         job_runs = {
             "status": "running",
@@ -587,6 +630,44 @@ def test_job_running_job_exceeds_expected_runtime(self):
         assert_equal(run["id"], "MASTER.test.99")
         assert_equal(state, State.STUCK)
 
+    def test_job_starting_job_exceeds_expected_runtime(self):
+        job_runs = {
+            "status": "running",
+            "next_run": None,
+            "expected_runtime": 480.0,
+            "allow_overlap": True,
+            "runs": [
+                {
+                    "id": "MASTER.test.100",
+                    "state": "scheduled",
+                    "run_time": time.strftime(
+                        "%Y-%m-%d %H:%M:%S",
+                        time.localtime(time.time() + 600),
+                    ),
+                    "end_time": None,
+                    "start_time": None,
+                    "duration": "",
+                },
+                {
+                    "id": "MASTER.test.99",
+                    "state": "starting",
+                    "run_time": time.strftime(
+                        "%Y-%m-%d %H:%M:%S",
+                        time.localtime(time.time() - 600),
+                    ),
+                    "start_time": time.strftime(
+                        "%Y-%m-%d %H:%M:%S",
+                        time.localtime(time.time() - 600),
+                    ),
+                    "end_time": None,
+                    "duration": "0:10:01.883601",
+                },
+            ],
+        }
+        run, state = check_tron_jobs.get_relevant_run_and_state(job_runs)
+        assert_equal(run["id"], "MASTER.test.99")
+        assert_equal(state, State.STUCK)
+
     def test_job_waiting_job_exceeds_expected_runtime_already_started(self):
         job_runs = {
             "status": "running",
@@ -854,6 +935,48 @@ def test_get_relevant_action_pick_the_one_exceeds_expected_runtime(self):
         )
         assert_equal(actual["id"], "MASTER.test.1.action3")
 
+    def test_get_relevant_action_pick_the_one_starting(self):
+        action_runs = [
+            {
+                "id": "MASTER.test.1.action3",
+                "state": "starting",
+                "start_time": time.strftime(
+                    "%Y-%m-%d %H:%M:%S",
+                    time.localtime(time.time() - 600),
+                ),
+                "duration": "0:10:00.006305",
+            },
+            {
+                "id": "MASTER.test.1.action2",
+                "state": "running",
+                "start_time": time.strftime(
+                    "%Y-%m-%d %H:%M:%S",
+                    time.localtime(time.time() - 600),
+                ),
+                "duration": "0:10:00.006383",
+            },
+            {
+                "id": "MASTER.test.1.action1",
+                "state": "succeeded",
+                "start_time": time.strftime(
+                    "%Y-%m-%d %H:%M:%S",
+                    time.localtime(time.time() - 600),
+                ),
+                "duration": "0:10:00.006331",
+            },
+        ]
+        actions_expected_runtime = {
+            "action3": 480.0,
+            "action2": 720.0,
+            "action1": 900.0,
+        }
+        actual = check_tron_jobs.get_relevant_action(
+            action_runs=action_runs,
+            last_state=State.STUCK,
+            actions_expected_runtime=actions_expected_runtime,
+        )
+        assert_equal(actual["id"], "MASTER.test.1.action3")
+
     def test_get_relevant_action_pick_the_one_exceeds_expected_runtime_with_long_duration(
         self,
     ):

diff --git a/tron/bin/check_tron_jobs.py b/tron/bin/check_tron_jobs.py
@@ -226,7 +226,7 @@ def get_relevant_run_and_state(job_content):
         state = run.get("state", "unknown")
         if state in ["failed", "succeeded", "unknown", "skipped"]:
             return run, State(state)
-        elif state in ["running", "waiting"]:
+        elif state in ["running", "waiting", "starting"]:
             action_state = is_action_failed_or_unknown(run)
             if action_state != State.SUCCEEDED:
                 return run, action_state
@@ -251,7 +251,7 @@ def is_job_stuck(
 ):
     next_run_time = None
     for job_run in job_runs:
-        states_to_check = {"running", "waiting"}
+        states_to_check = {"running", "waiting", "starting"}
         if job_run.get("state", "unknown") in states_to_check:
             if is_job_run_exceeding_expected_runtime(
                 job_run,
@@ -275,7 +275,7 @@ def is_job_stuck(
 
 
 def is_job_run_exceeding_expected_runtime(job_run, job_expected_runtime):
-    states_to_check = {"running", "waiting"}
+    states_to_check = {"running", "waiting", "starting"}
     if (
         job_expected_runtime is not None
         and job_run.get(
@@ -294,7 +294,8 @@ def is_action_run_exceeding_expected_runtime(
     action_run,
     actions_expected_runtime,
 ):
-    if action_run.get("state", "unknown") == "running":
+    states_to_check = ["running", "starting"]
+    if action_run.get("state", "unknown") in states_to_check:
         action_name = action_run.get("action_name", None)
         if action_name in actions_expected_runtime and actions_expected_runtime[action_name] is not None:
             duration_seconds = pytimeparse.parse(
@@ -319,7 +320,7 @@ def get_relevant_action(*, action_runs, last_state, actions_expected_runtime):
                     actions_expected_runtime,
                 ):
                     return action_run
-                if action_state == "running":
+                if action_state == "running" or action_state == "starting":
                     stuck_action_run_candidate = action_run
     return stuck_action_run_candidate or action_runs[-1]