
Commit 945fc76

Update to Airflow 2.0.0 (#52)
* Use Airflow 1.10.14 as a bridge before switching to Airflow 2.0
* Update dependencies to fit Airflow 2.0.0
* Update cwl-airflow init to upgrade to Airflow 2.0.0
* Update logging template to correspond to Airflow 2.0.0
* Fix bug in triggering DAGs from the API, update README
* Remove deprecated add_connections function
* Set a lower number of scheduler parsing processes in Travis tests
* Add DAG to resend all missed progress reports and results
* Print error category in cwl-airflow test
* Add docker pull limit error category
* No need for --conn-extra when adding a new connection
1 parent a2bdc32 commit 945fc76

21 files changed (+1093 additions, -666 deletions)


.travis.yml

Lines changed: 14 additions & 7 deletions
@@ -80,7 +80,7 @@ jobs:
 before_script:
   - cwl-airflow init --upgrade
   - rm -f ~/airflow/dags/bam-bedgraph-bigwig-single.cwl
-script: airflow list_dags --report # to check if all DAGs are correct
+script: airflow dags list # to check if all DAGs are correct
 - name: Test packaging for Ubuntu 18.04, Python 3.6
 install:
   - ./packaging/portable/ubuntu/pack.sh 18.04 3.6 $TRAVIS_BRANCH
@@ -89,37 +89,44 @@ jobs:
 before_script:
   - ./python3/bin_portable/airflow --help # to generate airflow.cfg
   - sed -i'.backup' -e 's/^executor.*/executor = LocalExecutor/g' ~/airflow/airflow.cfg
+  - sed -i'.backup' -e 's/^parsing_processes.*/parsing_processes = 1/g' ~/airflow/airflow.cfg
+  - sed -i'.backup' -e 's/^sql_alchemy_pool_enabled.*/sql_alchemy_pool_enabled = False/g' ~/airflow/airflow.cfg
   - sed -i'.backup' -e 's/^dag_dir_list_interval =.*/dag_dir_list_interval = 60/g' ~/airflow/airflow.cfg
   - sed -i'.backup' -e 's/^parallelism =.*/parallelism = 1/g' ~/airflow/airflow.cfg
   - sed -i'.backup' -e 's/^sql_alchemy_conn.*/sql_alchemy_conn = mysql:\/\/airflow:airflow@127.0.0.1:6603\/airflow/g' ~/airflow/airflow.cfg
-  - ./python3/bin_portable/cwl-airflow init # to init database
-  - ./python3/bin_portable/airflow connections --add --conn_id process_report --conn_type http --conn_host localhost --conn_port 3070 --conn_extra "{\"endpoint\":\"/airflow/\"}" # to add process_report connection
+  - ./python3/bin_portable/cwl-airflow init
+  - ./python3/bin_portable/airflow connections add process_report --conn-type http --conn-host localhost --conn-port 3070 # to add process_report connection
   - ./python3/bin_portable/airflow scheduler > /dev/null 2>&1 &
   - ./python3/bin_portable/cwl-airflow api > /dev/null 2>&1 &
+  - sleep 5 # to let scheduler to parse all dags, otherwise we can't run the following command
+  - ./python3/bin_portable/airflow dags unpause resend_results
 script: ./python3/bin_portable/cwl-airflow test --suite workflows/tests/conformance_tests.yaml --spin --range 1
 
 before_install:
   - git clone https://github.com/datirium/workflows.git --recursive
   - docker pull mysql/mysql-server:5.7
   - docker run -v ~/database:/var/lib/mysql -e MYSQL_ROOT_PASSWORD=airflow -e MYSQL_DATABASE=airflow -e MYSQL_USER=airflow -e MYSQL_PASSWORD=airflow -p 6603:3306 -d mysql/mysql-server:5.7 --explicit-defaults-for-timestamp=1
 install:
-  - pip install ".[mysql,crypto]" --constraint ./packaging/constraints/constraints-$TRAVIS_PYTHON_VERSION.txt
+  - pip install ".[mysql]" --constraint ./packaging/constraints/constraints-$TRAVIS_PYTHON_VERSION.txt
 before_script:
   - airflow --help # to generate airflow.cfg
   - sed -i'.backup' -e 's/^executor.*/executor = LocalExecutor/g' ~/airflow/airflow.cfg
+  - sed -i'.backup' -e 's/^parsing_processes.*/parsing_processes = 1/g' ~/airflow/airflow.cfg
+  - sed -i'.backup' -e 's/^sql_alchemy_pool_enabled.*/sql_alchemy_pool_enabled = False/g' ~/airflow/airflow.cfg
   - sed -i'.backup' -e 's/^dag_dir_list_interval =.*/dag_dir_list_interval = 60/g' ~/airflow/airflow.cfg
   - sed -i'.backup' -e 's/^parallelism =.*/parallelism = 1/g' ~/airflow/airflow.cfg
   - sed -i'.backup' -e 's/^sql_alchemy_conn.*/sql_alchemy_conn = mysql:\/\/airflow:airflow@127.0.0.1:6603\/airflow/g' ~/airflow/airflow.cfg
-  - cwl-airflow init # to init database
-  - airflow connections --add --conn_id process_report --conn_type http --conn_host localhost --conn_port 3070 --conn_extra "{\"endpoint\":\"/airflow/\"}" # to add process_report connection
+  - cwl-airflow init
+  - airflow connections add process_report --conn-type http --conn-host localhost --conn-port 3070 # to add process_report connection
   - airflow scheduler > /dev/null 2>&1 &
   - cwl-airflow api > /dev/null 2>&1 &
+  - sleep 5 # to let scheduler to parse all dags, otherwise we can't run the following command
+  - airflow dags unpause resend_results
 script: cwl-airflow test --suite workflows/tests/conformance_tests.yaml --spin --range $NTEST
 
 branches:
   only:
     - master
-    - /^*_devel$/
     - /^([1-9]\d*!)?(0|[1-9]\d*)(\.(0|[1-9]\d*))*((a|b|rc)(0|[1-9]\d*))?(\.post(0|[1-9]\d*))?(\.dev(0|[1-9]\d*))?$/
 
 notifications:

README.md

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 
 # **CWL-Airflow**
 
-Python package to extend **[Apache-Airflow 1.10.12](https://airflow.apache.org)**
+Python package to extend **[Apache-Airflow 2.0.0](https://airflow.apache.org)**
 functionality with **[CWL v1.1](https://www.commonwl.org/v1.1/)** support
 
 ## **Cite as**

cwl_airflow/components/api/backend.py

Lines changed: 2 additions & 1 deletion
@@ -21,6 +21,7 @@
 from airflow.utils.state import State
 from airflow.utils.timezone import parse as parsedate
 from airflow.utils.db import provide_session
+from airflow.utils.types import DagRunType
 
 from cwl_airflow.utilities.helpers import (
     get_version,
@@ -200,7 +201,7 @@ def create_dag_run(self, dag_id, run_id, conf, session):
             raise ValueError(f"dag_run {run_id} for dag_id {dag_id} already exists")
         else:
             run_conf = conf if isinstance(conf, dict) else json.loads(conf)
-            dag_run = DagRun(dag_id=dag_id, run_id=run_id, conf=run_conf)
+            dag_run = DagRun(dag_id=dag_id, run_id=run_id, conf=run_conf, run_type=DagRunType.MANUAL)
             session.add(dag_run)
             session.commit()
             return dag_run
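
Note on the backend.py change: Airflow 2.0.0 added a run_type field to DagRun, and runs created outside the scheduler have to declare it explicitly, which is why the API backend now passes DagRunType.MANUAL. A minimal illustrative sketch, not part of this commit; the dag_id, run_id, and conf below are hypothetical:

# Illustrative only: constructing a manually triggered DagRun under Airflow 2.0.0.
# Without run_type the run cannot be stored, since the column is required in 2.0.0.
from airflow.models import DagRun
from airflow.utils.types import DagRunType

dag_run = DagRun(
    dag_id="some_workflow",                      # hypothetical DAG id
    run_id="manual__2021-01-01T00:00:00+00:00",  # hypothetical run id
    conf={"job": {}},                            # hypothetical run configuration
    run_type=DagRunType.MANUAL                   # marks the run as manually triggered
)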

cwl_airflow/components/init/config.py

Lines changed: 98 additions & 61 deletions
@@ -13,31 +13,54 @@
 )
 
 with CleanAirflowImport():
-    from airflow import models
     from airflow.configuration import conf
-    from airflow.utils.db import merge_conn
+    from airflow.exceptions import AirflowConfigException
     from airflow.utils.dag_processing import list_py_file_paths
     from cwl_airflow.utilities.cwl import overwrite_deprecated_dag
 
 
 def run_init_config(args):
     """
     Runs sequence of steps required to configure CWL-Airflow
-    for the first time. Safe to run several times
+    for the first time. Safe to run several times. Upgrades
+    config to correspond to Airflow 2.0.0
     """
 
+    create_airflow_config(args) # will create default airflow.cfg if it wasn't present
+    patch_airflow_config(args)
     init_airflow_db(args)
-    patch_airflow_config(args.config)
-    # add_connections(args)
+
     if args.upgrade:
-        upgrade_dags(args.config)
-        copy_dags(args.home)
+        upgrade_dags(args)
+        copy_dags(args)
+
+
+def create_airflow_config(args):
+    """
+    Runs airflow --help command with AIRFLOW_HOME and AIRFLOW_CONFIG
+    environment variables just to create airflow.cfg file
+    """
+
+    custom_env = os.environ.copy()
+    custom_env["AIRFLOW_HOME"] = args.home
+    custom_env["AIRFLOW_CONFIG"] = args.config
+    try:
+        run(
+            ["airflow", "--help"],
+            env=custom_env,
+            check=True,
+            stdout=DEVNULL,
+            stderr=DEVNULL
+        )
+    except (FileNotFoundError, CalledProcessError) as err:
+        logging.error(f"""Failed to find or to run airflow executable'. Exiting.\n{err}""")
+        sys.exit(1)
 
 
 def init_airflow_db(args):
     """
     Sets AIRFLOW_HOME and AIRFLOW_CONFIG from args.
-    Call airflow initdb from subprocess to make sure
+    Call airflow db init from subprocess to make sure
     that the only two things we should care about
     are AIRFLOW_HOME and AIRFLOW_CONFIG
     """
"""
@@ -47,38 +70,85 @@ def init_airflow_db(args):
     custom_env["AIRFLOW_CONFIG"] = args.config
     try:
         run(
-            ["airflow", "initdb"], # TODO: check what's the difference initdb from updatedb
+            ["airflow", "db", "init"], # `db init` always runs `db upgrade` internally, so it's ok to run only `db init`
             env=custom_env,
             check=True,
             stdout=DEVNULL,
             stderr=DEVNULL
         )
-    except (CalledProcessError, FileNotFoundError) as err:
-        logging.error(f"""Failed to run 'airflow initdb'. Exiting.\n{err}""")
+    except (FileNotFoundError) as err:
+        logging.error(f"""Failed to find airflow executable'. Exiting.\n{err}""")
+        sys.exit(1)
+    except (CalledProcessError) as err:
+        logging.error(f"""Failed to run 'airflow db init'. Delete airflow.db if SQLite was used. Exiting.\n{err}""")
         sys.exit(1)
 
 
-def patch_airflow_config(airflow_config):
+def patch_airflow_config(args):
     """
-    Updates provided Airflow configuration file to include defaults for cwl-airflow.
-    If something went wrong, restores the original airflow.cfg from the backed up copy
+    Updates current Airflow configuration file to include defaults for cwl-airflow.
+    If something went wrong, restores the original airflow.cfg from the backed up copy.
+    If update to Airflow 2.0.0 is required, generates new airflow.cfg with some of the
+    important parameters copied from the old airflow.cfg. Backed up copy is not deleted in
+    this case.
     """
 
     # TODO: add cwl section with the following parameters:
     # - singularity
     # - use_container
 
+    # CWL-Airflow specific settings
     patches = [
-        ["sed", "-i", "-e", "s/^dags_are_paused_at_creation.*/dags_are_paused_at_creation = False/g", airflow_config],
-        ["sed", "-i", "-e", "s/^load_examples.*/load_examples = False/g", airflow_config],
-        ["sed", "-i", "-e", "s/^logging_config_class.*/logging_config_class = cwl_airflow.config_templates.airflow_local_settings.DEFAULT_LOGGING_CONFIG/g", airflow_config],
-        ["sed", "-i", "-e", "s/^hide_paused_dags_by_default.*/hide_paused_dags_by_default = True/g", airflow_config]
+        ["sed", "-i", "-e", "s#^dags_are_paused_at_creation.*#dags_are_paused_at_creation = False#g", args.config],
+        ["sed", "-i", "-e", "s#^load_examples.*#load_examples = False#g", args.config],
+        ["sed", "-i", "-e", "s#^load_default_connections.*#load_default_connections = False#g", args.config],
+        ["sed", "-i", "-e", "s#^logging_config_class.*#logging_config_class = cwl_airflow.config_templates.airflow_local_settings.DEFAULT_LOGGING_CONFIG#g", args.config],
+        ["sed", "-i", "-e", "s#^hide_paused_dags_by_default.*#hide_paused_dags_by_default = True#g", args.config]
     ]
 
-    airflow_config_backup = airflow_config + "_backup_" + str(uuid.uuid4())
+    # Minimum amount of setting that should be enough for starting
+    # SequentialExecutor, LocalExecutor or CeleryExecutor with
+    # the same dags and metadata database after updating to Airflow 2.0.0.
+    # All other user specific settings should be manually updated from the
+    # backuped airflow.cfg as a lot of them have been refactored.
+    transferable_settings = [
+        ("core", "dags_folder"),
+        ("core", "default_timezone"),
+        ("core", "executor"),
+        ("core", "sql_alchemy_conn"),
+        ("core", "sql_engine_encoding"), # just in case
+        ("core", "fernet_key"), # to be able to read from the old database
+        ("celery", "broker_url"),
+        ("celery", "result_backend")
+    ]
+
+    # create a temporary backup of airflow.cfg to restore from if we failed to apply patches
+    # this backup will be deleted after all patches applied if it wasn't created right before
+    # Airflow version update to 2.0.0
+    airflow_config_backup = args.config + "_backup_" + str(uuid.uuid4())
     try:
-        shutil.copyfile(airflow_config, airflow_config_backup)
+        # reading aiflow.cfg before applying any patches and creating backup
+        conf.read(args.config)
+        shutil.copyfile(args.config, airflow_config_backup)
+
+        # check if we need to make airflow.cfg correspond to the Airflow 2.0.0
+        # we search for [logging] section as it's present only Airflow >= 2.0.0
+        airflow_version_update = not conf.has_section("logging")
+        if airflow_version_update:
+            logging.info("Airflow config will be upgraded to correspond to Airflow 2.0.0")
+            for section, key in transferable_settings:
+                try:
+                    patches.append(
+                        ["sed", "-i", "-e", f"s#^{key}.*#{key} = {conf.get(section, key)}#g", args.config]
+                    )
+                except AirflowConfigException: # just skip missing in the config section/key
+                    pass
+            os.remove(args.config) # remove old config
+            create_airflow_config(args) # create new airflow.cfg with the default values
+
+        # Apply all patches
         for patch in patches:
+            logging.debug(f"Applying patch {patch}")
             run(
                 patch,
                 shell=False, # for proper handling of filenames with spaces
@@ -89,17 +159,17 @@ def patch_airflow_config(airflow_config):
     except (CalledProcessError, FileNotFoundError) as err:
         logging.error(f"""Failed to patch Airflow configuration file. Restoring from the backup and exiting.\n{err}""")
         if os.path.isfile(airflow_config_backup):
-            shutil.copyfile(airflow_config_backup, airflow_config)
+            shutil.copyfile(airflow_config_backup, args.config)
         sys.exit(1)
     finally:
-        if os.path.isfile(airflow_config_backup):
+        if os.path.isfile(airflow_config_backup) and not airflow_version_update:
            os.remove(airflow_config_backup)
 
 
-def upgrade_dags(airflow_config):
+def upgrade_dags(args):
     """
     Corrects old style DAG python files into the new format.
-    Reads configuration from "airflow_config". Uses standard
+    Reads configuration from "args.config". Uses standard
     "conf.get" instead of "conf_get", because the fields we
     use are always set. Copies all deprecated dags into the
     "deprecated_dags" folder, adds deprecated DAGs to the
@@ -109,7 +179,7 @@ def upgrade_dags(airflow_config):
     files remain unchanged.
     """
 
-    conf.read(airflow_config)
+    conf.read(args.config) # this will read already patched airflow.cfg
     dags_folder = conf.get("core", "dags_folder")
     for dag_location in list_py_file_paths( # will skip all DAGs from ".airflowignore"
         directory=dags_folder,
@@ -125,10 +195,10 @@ def upgrade_dags(airflow_config):
     )
 
 
-def copy_dags(airflow_home, source_folder=None):
+def copy_dags(args, source_folder=None):
     """
     Copies *.py files (dags) from source_folder (default ../../extensions/dags)
-    to dags_folder, which is always {airflow_home}/dags. Overwrites existent
+    to dags_folder, which is always {args.home}/dags. Overwrites existent
     files
     """
 
@@ -142,42 +212,9 @@ def copy_dags(airflow_home, source_folder=None):
         "extensions/dags",
     )
 
-    target_folder = get_dir(os.path.join(airflow_home, "dags"))
+    target_folder = get_dir(os.path.join(args.home, "dags"))
     for root, dirs, files in os.walk(source_folder):
         for filename in files:
             if re.match(".*\\.py$", filename) and filename != "__init__.py":
                 # if not os.path.isfile(os.path.join(target_folder, filename)):
                 shutil.copy(os.path.join(root, filename), target_folder)
-
-
-# not used anymore
-def add_connections(args):
-    """
-    Sets AIRFLOW_HOME and AIRFLOW_CONFIG from args.
-    Call 'airflow connections --add' from subproces to make sure that
-    the only two things we should care about are AIRFLOW_HOME and
-    AIRFLOW_CONFIG. Adds "process_report" connections to the Airflow DB
-    that is used to report workflow execution progress and results.
-    """
-
-    custom_env = os.environ.copy()
-    custom_env["AIRFLOW_HOME"] = args.home
-    custom_env["AIRFLOW_CONFIG"] = args.config
-    try:
-        run(
-            [
-                "airflow", "connections", "--add",
-                "--conn_id", "process_report",
-                "--conn_type", "http",
-                "--conn_host", "localhost",
-                "--conn_port", "3070",
-                "--conn_extra", "{\"endpoint\":\"/airflow/\"}"
-            ],
-            env=custom_env,
-            check=True,
-            stdout=DEVNULL,
-            stderr=DEVNULL
-        )
-    except (CalledProcessError, FileNotFoundError) as err:
-        logging.error(f"""Failed to run 'airflow connections --add'. Exiting.\n{err}""")
-        sys.exit(1)
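
Note on the config.py change: the new patch_airflow_config detects a pre-2.0.0 airflow.cfg by the absence of the [logging] section, copies a short whitelist of transferable settings out of it, regenerates a default Airflow 2.0.0 config, and then re-applies those settings as sed patches. A standalone sketch of that detect-and-transfer step, using the standard-library configparser instead of Airflow's conf object; the paths and the shortened settings list are illustrative only:

# Illustrative only: detect an old config and collect the settings worth carrying over.
import configparser

old_config = configparser.ConfigParser()
old_config.read("/tmp/airflow.cfg.backup")  # hypothetical backup of the 1.10.x config

# the [logging] section exists only in Airflow >= 2.0.0 configs,
# so its absence marks a file that needs to be regenerated
needs_upgrade = not old_config.has_section("logging")

transferable_settings = [  # shortened, illustrative list
    ("core", "dags_folder"),
    ("core", "sql_alchemy_conn"),
    ("core", "fernet_key"),
]

if needs_upgrade:
    carried_over = {
        (section, key): old_config.get(section, key)
        for section, key in transferable_settings
        if old_config.has_option(section, key)  # skip keys missing from the old file
    }
    print(carried_over)  # these values would then be sed'ed into the freshly generated airflow.cfg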

cwl_airflow/components/test/conformance.py

Lines changed: 4 additions & 4 deletions
@@ -76,11 +76,11 @@ def do_POST(self):
                 int(self.headers["Content-Length"])
             ).decode("UTF-8")
         )["payload"]
-        if "results" in payload or payload.get("state", None) == "failed": # "results" can be {}, so we should check only if key is present, but not value
-            self.server.results_queue.put({
-                "run_id": payload["run_id"],
+        if "results" in payload or payload["error"] != "": # "results" can be {}, so we should check only if key is present, but not value
+            self.server.results_queue.put({ # we read "error" without get, because if we got to this line and "results" not in payload,
+                "run_id": payload["run_id"], # it will definately has "error"
                 "dag_id": payload["dag_id"],
-                "results": payload.get("results", None)
+                "results": payload.get("results", payload.get("error", None)) # here need to use get for "error", because it is calculated even if "results" is present
             })
 
 
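
Note on the conformance.py change: a report payload now counts as final when it either contains a "results" key (which may legitimately be an empty dict) or carries a non-empty "error" string, and the value pushed to the queue falls back to that error message. A small self-contained sketch of the same decision logic with made-up payloads:

# Illustrative only: mirrors the queueing condition above with made-up payloads.
def finished_value(payload):
    # "results" can be {}, so only the presence of the key matters;
    # otherwise a non-empty "error" string also marks the run as finished
    if "results" in payload or payload["error"] != "":
        return payload.get("results", payload.get("error", None))
    return None  # still running, nothing to queue

assert finished_value({"results": {}, "error": ""}) == {}
assert finished_value({"error": "docker pull limit reached"}) == "docker pull limit reached"
assert finished_value({"error": ""}) is None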
