-
Notifications
You must be signed in to change notification settings - Fork 0
/
orchestration_ci_helpers.py
executable file
·352 lines (300 loc) · 11.2 KB
/
orchestration_ci_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
#!/usr/bin/env python3
import json
import pandas as pd
import requests as r
import sys
import time
from atexit import register
from fire import Fire
from functools import partial, wraps
from logging import info, error, basicConfig
from os import environ as env, remove
from subprocess import run, PIPE, Popen, _active, _cleanup
from typing import Tuple
def retry_or_notify(exceptions, tries: int = 4, backoff: int = 2):
    """
    Retry calling the decorated function with a linearly growing delay;
    if every in-loop attempt fails, send a Slack notification (master
    branch only) and call the function one final time so the original
    exception propagates to the caller.

    Args:
        exceptions: The exception to check. May be a tuple of
            exceptions to check.
        tries: Number of times to try (not retry) before giving up.
            Overridable at decoration time via the MELT_CI_RETRIES env var.
        backoff: Backoff multiplier in seconds; attempt N sleeps
            N * backoff. Overridable via the MELT_CI_BACKOFF env var.
    """
    tries = int(env.get("MELT_CI_RETRIES", tries))
    backoff = int(env.get("MELT_CI_BACKOFF", backoff))

    def deco_retry(f):
        @wraps(f)
        def f_retry(*args, **kwargs):
            for attempt in range(1, tries + 2):
                mdelay = attempt * backoff
                try:
                    return f(*args, **kwargs)
                except exceptions as e:
                    error(
                        "{}, Attempt: {}, Retrying in {} seconds...".format(
                            e, attempt, mdelay
                        )
                    )
                    # Don't sleep after the last in-loop attempt: the
                    # notify-and-reraise step below runs immediately.
                    if attempt <= tries:
                        time.sleep(mdelay)
            # All attempts failed. Notify Slack on master only, then call
            # once more *outside* the try so the real exception reaches the
            # caller. env.get avoids a masking KeyError outside of CI.
            if env.get("CI_COMMIT_REF_NAME") == "master":
                job_msg = "Job Failed: {} -- {}".format(
                    env["CI_JOB_NAME"], env["CI_JOB_URL"]
                )
                stage_msg = "Stage: {}".format(env["CI_JOB_STAGE"])
                pipeline_msg = "Pipeline: {}".format(env["CI_PIPELINE_URL"])
                triggered_msg = "Triggered: {}".format(env["CI_PIPELINE_SOURCE"])
                # The original "{}".format(*args) raised IndexError for a
                # zero-positional-arg call; fall back to the function name.
                command_msg = "Command: {}".format(args[0] if args else f.__name__)
                slack_payload = {
                    "text": "\n".join(
                        [
                            job_msg,
                            stage_msg,
                            pipeline_msg,
                            triggered_msg,
                            command_msg,
                        ]
                    )
                }
                r.post(env["SLACK_WEBHOOK_URL"], data=json.dumps(slack_payload))
            return f(*args, **kwargs)

        return f_retry  # true decorator

    return deco_retry
@register
def exit_cleanup():
    """
    atexit hook: kill every subprocess still tracked by the subprocess
    module, reap it, and make a best-effort attempt to stop any
    lingering cloud_sql_proxy process.
    """
    for leftover in _active:
        leftover.kill()
        leftover.communicate()
    _cleanup()
    # Best effort: the proxy may already be gone; errors go to a pipe.
    run("kill -9 $(pgrep cloud_sql_proxy)", shell=True, stderr=PIPE)
    if _active:
        error("Sub processes could not be cleaned up.")
    else:
        info("Processes cleaned up.")
def auth_gcloud(bash) -> None:
    """
    Authenticate the gcloud service account.

    Writes the service-account key from the GCP_SERVICE_CREDS env var to
    gcp_credentials.json, then activates it through the supplied runner.

    Args:
        bash: Callable executing a shell command string (e.g. a
            partial(run, shell=True, check=True)).

    Exits the process with status 1 on any failure.
    """
    try:
        with open("gcp_credentials.json", "w") as file:
            file.write(env["GCP_SERVICE_CREDS"])
        bash("gcloud auth activate-service-account --key-file=gcp_credentials.json")
    except IOError:
        error("Could not store GCP creds as a json file.")
        sys.exit(1)
    # Was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt; trap only genuine runtime failures.
    except Exception:
        error("Could not authenticate service account.")
        sys.exit(1)
    info("Account successfully authenticated.")
def find_sql_instance(gcp_instance_ref_slug) -> str:
    """
    Find the gcp instance with the same ref slug, don't worry about the ID.

    On master the production instance name is returned directly; otherwise
    the gcloud instance list is scanned for a cell containing the slug.

    Returns:
        The single matching instance name, or None when there is no match
        (or more than one).
    """
    if env["CI_COMMIT_REF_NAME"] == "master":
        return env["GCP_PRODUCTION_INSTANCE_NAME"]
    instance_list_raw = run(
        "gcloud sql instances list --project {}".format(env["GCP_PROJECT"]),
        stdout=PIPE,
        shell=True,
        check=True,
    ).stdout
    # gcloud prints a whitespace-separated table; split on spaces and
    # newlines so every cell gets checked for the slug.
    matches = [
        candidate
        for instance_row in instance_list_raw.decode("utf-8").split(" ")
        for candidate in instance_row.split("\n")
        if gcp_instance_ref_slug in candidate
    ]
    if not matches:
        info("No instance found.")
        return None
    if len(matches) > 1:
        # The original [name] = [...] destructure raised ValueError here,
        # was swallowed by a bare except, and then crashed with NameError
        # on the return; fail soft and report instead.
        error("Multiple instances matched slug {}.".format(gcp_instance_ref_slug))
        return None
    return matches[0]
def set_sql_instance() -> Tuple[str, str, str]:
    """
    Create a sql instance using the slug and ci_job_id.
    Max length for instance name is 95, take away the length of the
    job_id and the two hyphens and that leaves 83 characters for the name.
    """
    project = env["CI_PROJECT_NAME"]
    # Truncate the ref slug so project + slug stays within 83 characters.
    ref_slug = "-".join([project, env["CI_COMMIT_REF_SLUG"][: 83 - len(project)]])
    job_name = "-".join([ref_slug, env["CI_JOB_ID"][:8]])
    existing = find_sql_instance(ref_slug)
    info("Found instance with name: {}".format(existing))
    return existing, job_name, ref_slug
def async_run(command: str, instance: str) -> None:
    """
    Run gcloud commands using the async flag and waiting for them to finish/fail.

    Appends " --async" to the command, fetches the pending operation id for
    the given instance, then polls its status every ~10 seconds until DONE.

    Args:
        command: gcloud command to run (without the --async flag).
        instance: Cloud SQL instance whose pending operation is polled.
    """
    info("Running command using async: {}".format(command))
    run(command + " --async", shell=True, check=True, stdout=PIPE)
    # Returns bytes with newlines, assume the first operation id is correct
    # NOTE(review): if several operations are pending, this keeps ALL of the
    # newline-joined ids, so the describe call below would receive a mangled
    # name — confirm only one operation can be pending per instance.
    operation = (
        run(
            "gcloud sql operations list --instance='{}' --filter='status!=DONE' --format='value(name)'".format(
                instance
            ),
            shell=True,
            check=True,
            stdout=PIPE,
        )
        .stdout.decode("utf-8")
        .rstrip()
    )
    info("Pending Operation: {}".format(operation))
    info("Waiting for operation to finish/fail...")
    while True:
        # Sleep first (inside the shell command) so the API isn't hammered.
        status = (
            run(
                'sleep 10; gcloud sql operations describe "{}" --format="value(status)"'.format(
                    operation
                ),
                shell=True,
                check=True,
                stdout=PIPE,
            )
            .stdout.decode("utf-8")
            .rstrip()
        )
        # NOTE(review): exits only on DONE; assumes a failed operation still
        # eventually reports DONE, otherwise this loops forever — confirm.
        if status == "DONE":
            info(status)
            break
        else:
            info(status)
    return
@retry_or_notify(Exception)
def use_cloudsqlproxy(command: str) -> None:
    """
    Execute a command while running the cloud sql proxy in the background.

    Args:
        command: Shell command to run while the proxy is up; failures raise
            (check=True) and are retried by the retry_or_notify decorator.
    """
    # Get the instance name and start the proxy
    instance_name, *_ = set_sql_instance()
    sql_proxy_command = "cloud_sql_proxy -instances={}:{}:{}=tcp:5432 -credential_file=gcp_credentials.json -verbose=False"
    # NOTE(review): the proxy process is started but never awaited or
    # terminated here — cleanup appears to rely on the exit_cleanup atexit
    # hook, and a startup race is presumably absorbed by the retry
    # decorator re-running the whole function — confirm.
    sql_proxy = Popen(
        sql_proxy_command.format(env["GCP_PROJECT"], env["GCP_REGION"], instance_name),
        shell=True,
    )
    info("Proxy is running.")
    run(command, shell=True, check=True)
    return
def delete_review_cloudsql() -> None:
    """
    Delete the cloudsql instance unless it is master (production).
    """
    # Look up the instance for this ref and refuse to touch production.
    name, *_ = set_sql_instance()
    if not name:
        error("This instance does not exist. Call manage_instances to create one.")
        sys.exit(1)
    if name == env["GCP_PRODUCTION_INSTANCE_NAME"]:
        info("The branch name cannot match the production EDW instance name.")
        sys.exit(1)
    run(
        'gcloud sql instances delete -q --project "{}" "{}"'.format(
            env["GCP_PROJECT"], name
        ),
        shell=True,
        check=True,
    )
    info("Instance Deleted.")
    return
def manage_review_cloudsql() -> None:
    """
    Determine whether to create, do nothing to, or clean up cloud instances.

    - FORCE=true and an instance exists: delete every instance whose name
      contains the CI project name, then fall through to clone a new one.
    - Instance exists, no FORCE: just echo its name and return.
    - No instance: clone a fresh one from production.
    """
    instance_name, job_gcp_instance_name, gcp_instance_ref_slug = set_sql_instance()
    # Check if script should force delete related instances
    if instance_name and env.get("FORCE") == "true":
        info("Cleaning up old sql instances.")
        list_instances_command = "gcloud sql instances list --project {} --filter {}"
        instance_list_raw = run(
            list_instances_command.format(env["GCP_PROJECT"], gcp_instance_ref_slug),
            shell=True,
            check=True,
            stdout=PIPE,
        ).stdout
        # Split the tabular gcloud output on spaces and newlines; keep any
        # cell containing the CI project name. The comprehension variable
        # shadows the outer instance_name but is scoped to the
        # comprehension in Python 3, so the outer value is untouched.
        instance_list = [
            instance_name
            for instance_row in instance_list_raw.decode("utf-8").split(" ")
            for instance_name in instance_row.split("\n")
            if env["CI_PROJECT_NAME"] in instance_name
        ]
        delete_instance_command = "gcloud sql instances delete -q --project {} {}"
        for instance in instance_list:
            info("Deleting instance: {}".format(instance))
            run(
                delete_instance_command.format(env["GCP_PROJECT"], instance),
                shell=True,
                check=True,
            )
    # If not forcing deletion and there is an instance, echo the name
    elif instance_name:
        info("Instance is available at: {}".format(instance_name))
        return
    # If no instance existed or force deleted, create a new instance
    info("Cloning new instance {}".format(job_gcp_instance_name))
    clone_instance_command = 'gcloud sql instances clone -q --project "{}" "{}" "{}"'
    async_run(
        clone_instance_command.format(
            env["GCP_PROJECT"],
            env["GCP_PRODUCTION_INSTANCE_NAME"],
            job_gcp_instance_name,
        ),
        job_gcp_instance_name,
    )
    return
def refresh_dev_cloudsql():
    """
    Update the dev instance.

    Restores the dev Cloud SQL instance from the most recent SUCCESSFUL
    backup of the production instance.
    """
    info("Restoring the dev instance from the latest successful prod backup.")
    run(
        "gcloud config set project {}".format(env["GCP_PROJECT"]),
        shell=True,
        check=True,
    )
    # Dump a list of recent backups into a txt file
    backup_list_filename = "backup_list.txt"
    run(
        "gcloud sql backups list --instance {} > {}".format(
            env["GCP_PRODUCTION_INSTANCE_NAME"], backup_list_filename
        ),
        shell=True,
        check=True,
        stdout=PIPE,
    )
    try:
        # Get the most recent successful backup ID. sep=r"\s+" replaces the
        # deprecated delim_whitespace=True (pandas >= 2.1) — same parsing.
        # Use the filename variable instead of repeating the literal.
        backup_df = pd.read_table(backup_list_filename, sep=r"\s+").query(
            'STATUS == "SUCCESSFUL"'
        )
        backup_df["WINDOW_START_TIME"] = pd.to_datetime(backup_df["WINDOW_START_TIME"])
        newest_backup = backup_df["WINDOW_START_TIME"].max()
        [backup_id] = backup_df.query("WINDOW_START_TIME == @newest_backup")["ID"]
    finally:
        # Always remove the scratch file, even when parsing fails.
        remove(backup_list_filename)
    # Trigger the dev instance refresh
    instance_refresh_command = 'gcloud sql backups restore {} -q --restore-instance="{}" --backup-instance="{}"'
    run(
        instance_refresh_command.format(
            backup_id, env["GCP_DEV_INSTANCE_NAME"], env["GCP_PRODUCTION_INSTANCE_NAME"]
        ),
        shell=True,
        check=True,
    )
if __name__ == "__main__":
    # Do some setup before running any Fire functions
    # level=20 is logging.INFO; log to stdout so CI job output captures it.
    basicConfig(stream=sys.stdout, level=20)
    # Shared shell runner: every command goes through the shell and raises
    # on a non-zero exit code.
    bash = partial(run, shell=True, check=True)
    auth_gcloud(bash)
    # Expose the helpers as CLI subcommands via python-fire.
    Fire(
        {
            "get_sql_instance": set_sql_instance,
            "use_proxy": use_cloudsqlproxy,
            "manage_instances": manage_review_cloudsql,
            "delete_instance": delete_review_cloudsql,
            "refresh_dev_instance": refresh_dev_cloudsql,
        }
    )