Skip to content

Commit b08d999

Browse files
committed
Turn TASK_GRACE_INTERVAL into a setting
The new setting is measured in seconds. Also added some clarification around graceful shutdown. fixes pulp#6242
1 parent 23b4f33 commit b08d999

File tree

4 files changed

+33
-13
lines changed

4 files changed

+33
-13
lines changed

CHANGES/6242.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added `TASK_GRACE_INTERVAL` as a setting to specify the number of seconds a worker gives the current task to finish when receiving SIGINT.

docs/admin/reference/settings.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,16 @@ Defaults to `30` seconds.
312312

313313

314314

315+
### TASK_GRACE_INTERVAL
316+
317+
On receiving SIGHUP or SIGTERM, a worker will wait indefinitely for the currently running task to finish.
318+
On SIGINT, this value represents the time before the worker will attempt to kill the subprocess.
319+
This time is only accurate to within one worker heartbeat, which corresponds to `WORKER_TTL / 3`.
320+
321+
Defaults to `600` seconds.
322+
323+
324+
315325
### REMOTE_USER_ENVIRON_NAME
316326

317327
The name of the WSGI environment variable to read for `webserver authentication
@@ -417,8 +427,6 @@ up before the task finishes. Default is 1440 minutes (24 hours).
417427

418428

419429

420-
421-
422430
### UPLOAD_PROTECTION_TIME, TMPFILE_PROTECTION_TIME and TASK_PROTECTION_TIME
423431

424432
Pulp uses `tasks`, `uploads` and `pulp temporary files` to pass data from the api to worker tasks.

pulpcore/app/settings.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,11 @@
292292
CONTENT_APP_TTL = 30
293293
WORKER_TTL = 30
294294

295+
# Seconds for a task to finish on semi-graceful worker shutdown (approx)
296+
# On SIGHUP, SIGTERM the currently running task will be awaited forever.
297+
# On SIGINT, this value represents the time before the worker will attempt to kill the subprocess.
298+
TASK_GRACE_INTERVAL = 600
299+
295300
# how long to protect ephemeral items in minutes
296301
ORPHAN_PROTECTION_TIME = 24 * 60
297302

pulpcore/tasking/worker.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@
4545
# Unless/until we can provide reasonable ways to decide to change their values,
4646
# they will live as constants instead of "proper" settings.
4747

48-
# Number of heartbeats for a task to finish on graceful worker shutdown (approx)
49-
TASK_GRACE_INTERVAL = 3
50-
# Number of heartbeats between attempts to kill the subprocess (approx)
48+
# Seconds for a task to finish on semi-graceful worker shutdown (approx)
49+
TASK_GRACE_INTERVAL = settings.TASK_GRACE_INTERVAL
50+
# Seconds between attempts to kill the subprocess (approx)
5151
TASK_KILL_INTERVAL = 1
5252
# Number of heartbeats between cleaning up worker processes (approx)
5353
WORKER_CLEANUP_INTERVAL = 100
@@ -69,7 +69,10 @@ def __init__(self):
6969
self.versions = {app.label: app.version for app in pulp_plugin_configs()}
7070
self.cursor = connection.cursor()
7171
self.worker = self.handle_worker_heartbeat()
72-
self.task_grace_timeout = 0
72+
# This defaults to immediate task cancelation.
73+
# It will be set into the future on moderately graceful worker shutdown,
74+
# and set to None for fully graceful shutdown.
75+
self.task_grace_timeout = timezone.now()
7376
self.worker_cleanup_countdown = random.randint(
7477
int(WORKER_CLEANUP_INTERVAL / 10), WORKER_CLEANUP_INTERVAL
7578
)
@@ -104,7 +107,8 @@ def _init_instrumentation(self):
104107
def _signal_handler(self, thesignal, frame):
105108
if thesignal in (signal.SIGHUP, signal.SIGTERM):
106109
_logger.info(_("Worker %s was requested to shut down gracefully."), self.name)
107-
self.task_grace_timeout = -1
110+
# Wait forever...
111+
self.task_grace_timeout = None
108112
else:
109113
# Reset signal handlers to default
110114
# If you kill the process a second time it's not graceful anymore.
@@ -113,7 +117,9 @@ def _signal_handler(self, thesignal, frame):
113117
signal.signal(signal.SIGHUP, signal.SIG_DFL)
114118

115119
_logger.info(_("Worker %s was requested to shut down."), self.name)
116-
self.task_grace_timeout = TASK_GRACE_INTERVAL
120+
self.task_grace_timeout = timezone.now() + timezone.timedelta(
121+
seconds=TASK_GRACE_INTERVAL
122+
)
117123
self.shutdown_requested = True
118124

119125
def _pg_notify_handler(self, notification):
@@ -173,8 +179,6 @@ def worker_cleanup(self):
173179
def beat(self):
174180
if self.worker.last_heartbeat < timezone.now() - self.heartbeat_period:
175181
self.worker = self.handle_worker_heartbeat()
176-
if self.task_grace_timeout > 0:
177-
self.task_grace_timeout -= 1
178182
self.worker_cleanup_countdown -= 1
179183
if self.worker_cleanup_countdown <= 0:
180184
self.worker_cleanup_countdown = WORKER_CLEANUP_INTERVAL
@@ -390,10 +394,12 @@ def supervise_task(self, task):
390394
task_process.start()
391395
while True:
392396
if cancel_state:
393-
if self.task_grace_timeout != 0:
397+
if self.task_grace_timeout is None or self.task_grace_timeout > timezone.now():
394398
_logger.info("Wait for canceled task to abort.")
395399
else:
396-
self.task_grace_timeout = TASK_KILL_INTERVAL
400+
self.task_grace_timeout = timezone.now() + timezone.timedelta(
401+
seconds=TASK_KILL_INTERVAL
402+
)
397403
_logger.info(
398404
"Aborting current task %s in domain: %s due to cancelation.",
399405
task.pk,
@@ -430,7 +436,7 @@ def supervise_task(self, task):
430436
if self.sentinel in r:
431437
os.read(self.sentinel, 256)
432438
if self.shutdown_requested:
433-
if self.task_grace_timeout != 0:
439+
if self.task_grace_timeout is None or self.task_grace_timeout > timezone.now():
434440
msg = (
435441
"Worker shutdown requested, waiting for task {pk} in domain: {name} "
436442
"to finish.".format(pk=task.pk, name=domain.name)

0 commit comments

Comments
 (0)