-
Notifications
You must be signed in to change notification settings - Fork 116
query: interrupt query script on SIGTERM #858
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
import os | ||
import os.path | ||
import posixpath | ||
import signal | ||
import subprocess | ||
import sys | ||
import time | ||
|
@@ -97,6 +98,47 @@ | |
pass | ||
|
||
|
||
class TerminationSignal(RuntimeError): # noqa: N818 | ||
def __init__(self, signal): | ||
self.signal = signal | ||
super().__init__("Received termination signal", signal) | ||
|
||
def __repr__(self): | ||
return f"{self.__class__.__name__}({self.signal})" | ||
|
||
|
||
if sys.platform == "win32": | ||
SIGINT = signal.CTRL_C_EVENT | ||
else: | ||
SIGINT = signal.SIGINT | ||
|
||
|
||
def shutdown_process( | ||
proc: subprocess.Popen, | ||
interrupt_timeout: Optional[int] = None, | ||
terminate_timeout: Optional[int] = None, | ||
) -> int: | ||
"""Shut down the process gracefully with SIGINT -> SIGTERM -> SIGKILL.""" | ||
|
||
logger.info("sending interrupt signal to the process %s", proc.pid) | ||
proc.send_signal(SIGINT) | ||
|
||
logger.info("waiting for the process %s to finish", proc.pid) | ||
try: | ||
return proc.wait(interrupt_timeout) | ||
except subprocess.TimeoutExpired: | ||
logger.info( | ||
"timed out waiting, sending terminate signal to the process %s", proc.pid | ||
) | ||
proc.terminate() | ||
try: | ||
return proc.wait(terminate_timeout) | ||
except subprocess.TimeoutExpired: | ||
logger.info("timed out waiting, killing the process %s", proc.pid) | ||
proc.kill() | ||
return proc.wait() | ||
|
||
|
||
def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None: | ||
buffer = b"" | ||
while byt := stream.read(1): # Read one byte at a time | ||
|
@@ -1493,6 +1535,8 @@ | |
output_hook: Callable[[str], None] = noop, | ||
params: Optional[dict[str, str]] = None, | ||
job_id: Optional[str] = None, | ||
interrupt_timeout: Optional[int] = None, | ||
terminate_timeout: Optional[int] = None, | ||
) -> None: | ||
cmd = [python_executable, "-c", query_script] | ||
env = dict(env or os.environ) | ||
|
@@ -1506,13 +1550,48 @@ | |
if capture_output: | ||
popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT} | ||
|
||
def raise_termination_signal(sig: int, _: Any) -> NoReturn: | ||
raise TerminationSignal(sig) | ||
|
||
thread: Optional[Thread] = None | ||
with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc: # noqa: S603 | ||
if capture_output: | ||
args = (proc.stdout, output_hook) | ||
thread = Thread(target=_process_stream, args=args, daemon=True) | ||
thread.start() | ||
thread.join() # wait for the reader thread | ||
logger.info("Starting process %s", proc.pid) | ||
|
||
orig_sigint_handler = signal.getsignal(signal.SIGINT) | ||
# ignore SIGINT in the main process. | ||
# In the terminal, SIGINTs are received by all the processes in | ||
# the foreground process group, so the script will receive the signal too. | ||
# (If we forward the signal to the child, it will receive it twice.) | ||
signal.signal(signal.SIGINT, signal.SIG_IGN) | ||
|
||
orig_sigterm_handler = signal.getsignal(signal.SIGTERM) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should be happening inside try (to avoid getting the TerminationSignal raised in between) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so it can be raised right before There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Theoretically yes, it could. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so, then we should move it inside and catch and make sure that we handle if for example of the original handlers is not yet initialized, etc There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this a few lines code change, no? also, doing multithreading / multiprocessing is always hard exactly because lot of problems manifest themselves rarely but bite hard in terms of time someone can then spend debugging them later ... so, yes, if there are obvious improvements while it is still possible w/o obvious downsides I would always err on the side of doing safer code There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The best thing to do here is move There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There is no There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
probably, and in
there is to my mind. An external process sends a signal to a another one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. #873. |
||
signal.signal(signal.SIGTERM, raise_termination_signal) | ||
try: | ||
if capture_output: | ||
args = (proc.stdout, output_hook) | ||
thread = Thread(target=_process_stream, args=args, daemon=True) | ||
thread.start() | ||
|
||
proc.wait() | ||
except TerminationSignal as exc: | ||
signal.signal(signal.SIGTERM, orig_sigterm_handler) | ||
shcheklein marked this conversation as resolved.
Show resolved
Hide resolved
|
||
signal.signal(signal.SIGINT, orig_sigint_handler) | ||
logging.info("Shutting down process %s, received %r", proc.pid, exc) | ||
# Rather than forwarding the signal to the child, we try to shut it down | ||
# gracefully. This is because we consider the script to be interactive | ||
# and special, so we give it time to cleanup before exiting. | ||
shutdown_process(proc, interrupt_timeout, terminate_timeout) | ||
if proc.returncode: | ||
raise QueryScriptCancelError( | ||
"Query script was canceled by user", return_code=proc.returncode | ||
) from exc | ||
finally: | ||
signal.signal(signal.SIGTERM, orig_sigterm_handler) | ||
signal.signal(signal.SIGINT, orig_sigint_handler) | ||
if thread: | ||
thread.join() # wait for the reader thread | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [Q] Will this still run into deadlocks while we are waiting for tqdm/tqdm#1649? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Deadlock was happening when But I can imagine, when the conditions are favorable, the |
||
|
||
logging.info("Process %s exited with return code %s", proc.pid, proc.returncode) | ||
if proc.returncode == QUERY_SCRIPT_CANCELED_EXIT_CODE: | ||
raise QueryScriptCancelError( | ||
"Query script was canceled by user", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,10 @@ | |
|
||
import pytest | ||
|
||
from datachain.catalog.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE | ||
from datachain.catalog.catalog import ( | ||
QUERY_SCRIPT_CANCELED_EXIT_CODE, | ||
TerminationSignal, | ||
) | ||
from datachain.error import QueryScriptCancelError, QueryScriptRunError | ||
|
||
|
||
|
@@ -63,3 +66,15 @@ def test_non_zero_exitcode(catalog, mock_popen): | |
catalog.query("pass") | ||
assert e.value.return_code == 1 | ||
assert "Query script exited with error code 1" in str(e.value) | ||
|
||
|
||
def test_shutdown_process_on_sigterm(mocker, catalog, mock_popen): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure tbh this is a very meaningful test :) seems we are testing mocks primarily There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The main goal of this test here is to assert that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yep, I understand. Doesn't see to be very useful though. Or let me put it in a different way - I think we rarely test things this way (to check if function is called w/o testing logic in some way). Still seems to be not very meaningful to my mind. While we can do in this case most likely a proper test There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Handling signals is a tricky thing, especially when the main process needs to receive it (where it happens to be a test process for tests).
We do that in plenty of places in DVC, especially in CLI tests.
If you have some example unittest that we can replace this with (without crossing boundaries into the working of I did add a functional test, but it's complex, and you have to tread very carefully not to get main process killed/blocked/waiting forever. |
||
mock_popen.returncode = -2 | ||
mock_popen.wait.side_effect = [TerminationSignal(15)] | ||
m = mocker.patch("datachain.catalog.catalog.shutdown_process", return_value=-2) | ||
|
||
with pytest.raises(QueryScriptCancelError) as e: | ||
catalog.query("pass", interrupt_timeout=0.1, terminate_timeout=0.2) | ||
assert e.value.return_code == -2 | ||
assert "Query script was canceled by user" in str(e.value) | ||
m.assert_called_once_with(mock_popen, 0.1, 0.2) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unit test this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's no point in unittesting this. We'll check
mock_popen.terminate.assert_called_once()
which is not that useful.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean function test this - it should not be about testing "mocks" or course