Report error categories (#50)
* Add yield_file_content helper function with test

* Add get_error_category function to report error category

* Prevent duplicate error messages for all failed workflow steps

* Report only the first found error per failed task.

Error categories are sorted by priority. A high-level error can make the
markers of lower-level error categories appear in the log as well, so
those need to be skipped.
michael-kotliar authored Dec 4, 2020
1 parent dfcb2f5 commit 740a0e2
Showing 4 changed files with 98 additions and 2 deletions.
2 changes: 1 addition & 1 deletion cwl_airflow/utilities/cwl.py
@@ -587,7 +587,7 @@ def execute_workflow_step(
    sys.stderr = _stderr

    if step_status != "success":
-        raise ValueError
+        raise ValueError("Failed to run workflow step")

    # To remove "http://commonwl.org/cwltool#generation": 0 (copied from cwltool)
    visit_class(step_outputs, ("File",), MutationManager().unset_generation)
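
Note the exact message text: it doubles as the log marker that the new
get_error_category helper in report.py (shown below) searches for. A minimal
sketch of that match, with a hypothetical log tail:

    # hypothetical tail of a failed task's log
    log_tail = "ValueError: Failed to run workflow step"
    assert "Failed to run workflow step" in log_tail  # the marker matches
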
14 changes: 14 additions & 0 deletions cwl_airflow/utilities/helpers.py
@@ -15,6 +15,20 @@
from typing import MutableMapping, MutableSequence


# is not actually used anywhere
def yield_file_content(location):
    """
    Yields lines from a text file with leading and trailing whitespace
    (including the trailing newline) stripped. Empty lines and lines
    containing only spaces/tabs are skipped.
    """

    with open(location, "r") as input_stream:
        for line in input_stream:
            if line.strip():
                yield line.strip()


def get_compressed(data_str, reset_position=None):
    """
    Converts character string "data_str" as "utf-8" into bytes ("utf-8"
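
For context, a minimal usage sketch of the new helper (the file path here is
hypothetical):

    from cwl_airflow.utilities.helpers import yield_file_content

    # count the non-blank lines of a text file
    count = sum(1 for _ in yield_file_content("/tmp/example.txt"))
    print(count)
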
66 changes: 65 additions & 1 deletion cwl_airflow/utilities/report.py
@@ -6,6 +6,7 @@
from airflow.models import Variable
from airflow.utils.state import State
from airflow.hooks.http_hook import HttpHook
from airflow.configuration import conf


CONN_ID = "process_report"
@@ -32,7 +33,67 @@ def sign_with_jwt(data):
    return data


def get_error_category(context):
    """
    This function should be called only from the dag_run failure callback.
    It relies heavily on the log files, so the logging level in airflow.cfg
    must not be set above ERROR, or the error markers will never reach the
    logs. We load the log file only for the latest task retry, because
    get_error_category is called when the dag_run has already failed, so all
    previous task retries didn't bring any positive results. We load logs
    only for the actually failed tasks, not for the upstream_failed ones.
    All error categories are sorted by priority from the highest level to
    the lowest. We report only one error category (the highest-priority,
    first-found one) per failed task. Error categories from all failed tasks
    are combined and deduplicated. The "Failed to run workflow step" category
    is additionally filled with the failed task ids. The returned value is
    always a string.
    """

    ERROR_MARKERS = {
        "Docker is not available for this tool": "Docker or Network problems. Contact support team",
        "ERROR - Received SIGTERM. Terminating subprocesses": "Workflow was stopped. Restart with lower threads or memory parameters",
        "Failed to run workflow step": "Workflow step(s) {} failed. Contact support team"
    }

    # the Docker daemon is not running; the network is unavailable to pull the docker image, or the image doesn't exist
    # something took too many resources and Airflow killed the process, or something external has stopped the task
    # cwltool exited with an error when executing a workflow step

    dag_run = context["dag_run"]
    failed_tis = dag_run.get_task_instances(state=State.FAILED)
    log_handler = next(  # to get access to logs
        (
            h for h in logging.getLogger("airflow.task").handlers
            if h.name == conf.get("core", "task_log_reader")
        ),
        None
    )

    categories = set()  # use a set to prevent duplicates

    for ti in failed_tis:
        ti.task = context["dag"].get_task(ti.task_id)  # for some reason a task instance retrieved from the DagRun needs its "task" property set from the DAG
        try:  # in case log files were deleted or are unavailable
            logs, _ = log_handler.read(ti)  # logs is always a list
            for marker, category in ERROR_MARKERS.items():
                if marker in logs[-1]:  # logs[-1] is a string (with \n) from the last task retry
                    categories.add(category)
                    break
        except Exception as err:
            logging.debug(f"Failed to define the error category for task {ti.task_id}. \n {err}")

    if categories:
        return ". ".join(categories).format(", ".join(ti.task_id for ti in failed_tis))  # mainly to fill in the placeholder with the failed task ids
    return "Unknown error. Contact support team"


def post_progress(context, from_task=None):
    """
    If the dag_run failed but this function was called from a task callback,
    "error" will always be "". The "error" field is not "" only when this
    function is called from the DAG callback, which makes it the last and
    the only message with a meaningful error description.
    """

    from_task = False if from_task is None else from_task
    try:
        dag_run = context["dag_run"]
@@ -44,7 +105,7 @@ def post_progress(context, from_task=None):
"dag_id": dag_run.dag_id,
"run_id": dag_run.run_id,
"progress": int(len_tis_success / len_tis * 100),
"error": context["reason"] if dag_run.state == State.FAILED else ""
"error": get_error_category(context) if dag_run.state == State.FAILED and not from_task else ""
}
)
http_hook.run(endpoint=ROUTES["progress"], json={"payload": data})
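
A worked example of the progress calculation above, with hypothetical task
counts:

    len_tis = 4          # total task instances in the dag_run
    len_tis_success = 3  # successfully finished task instances
    assert int(len_tis_success / len_tis * 100) == 75  # reported as 75%
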
@@ -105,10 +166,12 @@ def task_on_success(context):


def task_on_failure(context):
    # no need to post progress as it hasn't been changed
    post_status(context)


def task_on_retry(context):
    # no need to post progress as it hasn't been changed
    post_status(context)


@@ -118,4 +181,5 @@ def dag_on_success(context):


def dag_on_failure(context):
    # we do need to post progress here, because the error is also reported in it
    post_progress(context)
18 changes: 18 additions & 0 deletions tests/test_helpers.py
@@ -11,6 +11,7 @@

from cwl_airflow.utilities.helpers import (
    CleanAirflowImport,
    yield_file_content,
    load_yaml,
    remove_field_from_dict,
    get_compressed,
@@ -24,6 +25,23 @@
tempfile.tempdir = "/private/tmp"


@pytest.mark.parametrize(
    "location, control_count",
    [
        (
            path.join(DATA_FOLDER, "jobs", "bam-bedgraph-bigwig.json"),
            11
        )
    ]
)
def test_yield_file_content(location, control_count):
    count = 0
    for _ in yield_file_content(location):
        count += 1
    assert control_count == count, \
        "Failed to read a proper number of lines from the text file"


@pytest.mark.parametrize(
"location, control_md5sum",
[
