|
1 | 1 | import logging
|
| 2 | +import operator |
| 3 | +import time |
| 4 | +from datetime import datetime |
2 | 5 |
|
3 | 6 | from civis import APIClient
|
4 | 7 | from civis.futures import CivisFuture
|
5 | 8 |
|
6 | 9 | log = logging.getLogger(__name__)
|
7 | 10 |
|
# Seconds to sleep between polls of the logs endpoint while following a run.
_FOLLOW_POLL_INTERVAL_SEC = 5
# Log entries older than this many seconds are considered safe from
# late-arriving messages with lower IDs (see _compute_effective_max_log_id).
_LOG_REFETCH_CUTOFF_SECONDS = 300
# How many of the most recent log entries to re-fetch when recent
# messages may still be arriving out of ID order.
_LOG_REFETCH_COUNT = 100
# Maximum number of log messages requested per list_runs_logs call.
_LOGS_PER_QUERY = 250
| 15 | + |
8 | 16 |
|
9 | 17 | def run_job(job_id, client=None, polling_interval=None):
|
10 | 18 | """Run a job.
|
@@ -96,3 +104,144 @@ def run_template(id, arguments, JSONValue=False, client=None):
|
96 | 104 | else:
|
97 | 105 | file_ids = {o.name: o.object_id for o in outputs}
|
98 | 106 | return file_ids
|
| 107 | + |
| 108 | + |
| 109 | +def _timestamp_from_iso_str(s): |
| 110 | + """Return an integer POSIX timestamp for a given ISO date string. |
| 111 | +
|
| 112 | + Note: Until Python 3.11, datetime.fromisoformat doesn't work |
| 113 | + with the format returned by Civis Platform. |
| 114 | + """ |
| 115 | + try: |
| 116 | + return datetime.fromisoformat(s).timestamp() |
| 117 | + except ValueError: |
| 118 | + try: |
| 119 | + # This is the format that Civis Platform returns. |
| 120 | + return datetime.strptime(s, "%Y-%m-%dT%H:%M:%S.%f%z").timestamp() |
| 121 | + except ValueError: |
| 122 | + # Another format, just in case. |
| 123 | + return datetime.strptime(s, "%Y-%m-%dT%H:%M:%S%z").timestamp() |
| 124 | + |
| 125 | + |
| 126 | +def _compute_effective_max_log_id(logs): |
| 127 | + """Find a max log ID use in order to avoid missing late messages. |
| 128 | +
|
| 129 | + The order of log IDs may not be consistent with "created at" times |
| 130 | + since log entries are created by Civis Platform as well as the code |
| 131 | + for the job itself. This function looks through recent logs |
| 132 | + and finds a maximum ID that is at least as old as a set cutoff period, |
| 133 | + so that messages with lower IDs that show up a bit late won't be skipped. |
| 134 | + With this, it is still theoretically possible but extremely unlikely |
| 135 | + for some late log messages to be skipped in the job_logs function. |
| 136 | + """ |
| 137 | + if not logs: |
| 138 | + return 0 |
| 139 | + |
| 140 | + sorted_logs = sorted(logs, key=operator.itemgetter("id")) |
| 141 | + |
| 142 | + max_created_at_timestamp = _timestamp_from_iso_str(sorted_logs[-1]["createdAt"]) |
| 143 | + cutoff = time.time() - _LOG_REFETCH_CUTOFF_SECONDS |
| 144 | + if max_created_at_timestamp < cutoff: |
| 145 | + return sorted_logs[-1]["id"] |
| 146 | + elif len(sorted_logs) >= _LOG_REFETCH_COUNT: |
| 147 | + return sorted_logs[-_LOG_REFETCH_COUNT]["id"] |
| 148 | + |
| 149 | + return 0 |
| 150 | + |
| 151 | + |
| 152 | +def _job_finished_past_timeout(job_id, run_id, finished_timeout, raw_client): |
| 153 | + """Return true if the run finished more than so many seconds ago.""" |
| 154 | + if finished_timeout is None: |
| 155 | + return False |
| 156 | + |
| 157 | + run = raw_client.jobs.get_runs(job_id, run_id) |
| 158 | + finished_at = run.json()["finishedAt"] |
| 159 | + if finished_at is None: |
| 160 | + return False |
| 161 | + finished_at_ts = _timestamp_from_iso_str(finished_at) |
| 162 | + result = finished_at_ts < time.time() - finished_timeout |
| 163 | + return result |
| 164 | + |
| 165 | + |
def job_logs(job_id, run_id=None, finished_timeout=None):
    """Return a generator of log message dictionaries for a given run.

    Parameters
    ----------
    job_id : int
        The ID of the job to retrieve log messages for.
    run_id : int or None
        The ID of the run to retrieve log messages for.
        If None, the ID for the most recent run will be used.
    finished_timeout : int or None
        If not None, then this function will return once the run has
        been finished for the specified number of seconds.
        If None, then this function will wait until the API says there
        will be no more new log messages, which may take a few minutes.
        A timeout of 30-60 seconds is usually enough to retrieve all
        log messages.

    Yields
    ------
    dict
        A log message dictionary with "message", "createdAt" and other attributes
        provided by the job logs endpoint. Note that this will block execution
        until the job has stopped and all log messages are retrieved.
    """
    # The return_type for the client is "raw" in order to check
    # the "civis-cache-control" and "civis-max-id" headers when
    # list_runs_logs returns an empty list of new messages.
    # Caching of the endpoint information in
    # civis.resources.generate_classes_maybe_cached avoids extra API calls.
    raw_client = APIClient(return_type="raw")

    if run_id is None:
        run_id = raw_client.jobs.list_runs(
            job_id, limit=1, order="id", order_dir="desc"
        ).json()[0]["id"]

    local_max_log_id = 0
    continue_polling = True

    # IDs already yielded, so re-fetched messages aren't yielded twice.
    known_log_ids = set()

    while continue_polling:
        # This call gets a limited number of log messages since last_id,
        # ordered by log ID.
        response = raw_client.jobs.list_runs_logs(
            job_id,
            run_id,
            last_id=local_max_log_id,
            limit=_LOGS_PER_QUERY,
        )
        if "civis-max-id" in response.headers:
            remote_max_log_id = int(response.headers["civis-max-id"])
        else:
            # Platform hasn't seen any logs at all yet.
            remote_max_log_id = None
        logs = response.json()
        if logs:
            local_max_log_id = max(entry["id"] for entry in logs)
            logs.sort(key=operator.itemgetter("createdAt", "id"))
            # The loop variable is deliberately named ``entry`` rather than
            # ``log`` so it doesn't shadow the module-level logger.
            for entry in logs:
                if entry["id"] in known_log_ids:
                    continue
                known_log_ids.add(entry["id"])
                yield entry

        # Platform signals that no more log messages will arrive by making
        # the response cacheable (anything other than "no-store").
        log_finished = response.headers["civis-cache-control"] != "no-store"

        if remote_max_log_id is None:
            remote_has_more_logs_to_get_now = False
        elif local_max_log_id == remote_max_log_id:
            remote_has_more_logs_to_get_now = False
            # Back off the high-water mark so late-arriving messages with
            # lower IDs are re-fetched (and deduplicated) on the next poll.
            local_max_log_id = _compute_effective_max_log_id(logs)
            if log_finished or _job_finished_past_timeout(
                job_id, run_id, finished_timeout, raw_client
            ):
                continue_polling = False
        else:
            remote_has_more_logs_to_get_now = True

        if continue_polling and not remote_has_more_logs_to_get_now:
            time.sleep(_FOLLOW_POLL_INTERVAL_SEC)
0 commit comments