Fix signal propagation (#85907)

pull/85985/head
sivel / Matt Martz 2 months ago committed by GitHub
parent 9ee667030f
commit 5a9afe4409
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -0,0 +1,3 @@
bugfixes:
- SIGINT/SIGTERM Handling - Make SIGINT/SIGTERM handling more robust by splitting concerns
between forks and the parent.

@ -17,6 +17,7 @@
from __future__ import annotations from __future__ import annotations
import errno
import io import io
import os import os
import signal import signal
@ -103,11 +104,19 @@ class WorkerProcess(multiprocessing_context.Process): # type: ignore[name-defin
self._cliargs = cliargs self._cliargs = cliargs
def _term(self, signum, frame) -> None: def _term(self, signum, frame) -> None:
""" """In child termination when notified by the parent"""
terminate the process group created by calling setsid when signal.signal(signum, signal.SIG_DFL)
a terminate signal is received by the fork
""" try:
os.killpg(self.pid, signum) os.killpg(self.pid, signum)
os.kill(self.pid, signum)
except OSError as e:
if e.errno != errno.ESRCH:
signame = signal.strsignal(signum)
display.error(f'Unable to send {signame} to child[{self.pid}]: {e}')
# fallthrough, if we are still here, just die
os._exit(1)
def start(self) -> None: def start(self) -> None:
""" """
@ -121,11 +130,6 @@ class WorkerProcess(multiprocessing_context.Process): # type: ignore[name-defin
# FUTURE: this lock can be removed once a more generalized pre-fork thread pause is in place # FUTURE: this lock can be removed once a more generalized pre-fork thread pause is in place
with display._lock: with display._lock:
super(WorkerProcess, self).start() super(WorkerProcess, self).start()
# Since setsid is called later, if the worker is termed
# it won't term the new process group
# register a handler to propagate the signal
signal.signal(signal.SIGTERM, self._term)
signal.signal(signal.SIGINT, self._term)
def _hard_exit(self, e: str) -> t.NoReturn: def _hard_exit(self, e: str) -> t.NoReturn:
""" """
@ -170,7 +174,6 @@ class WorkerProcess(multiprocessing_context.Process): # type: ignore[name-defin
# to give better errors, and to prevent fd 0 reuse # to give better errors, and to prevent fd 0 reuse
sys.stdin.close() sys.stdin.close()
except Exception as e: except Exception as e:
display.debug(f'Could not detach from stdio: {traceback.format_exc()}')
display.error(f'Could not detach from stdio: {e}') display.error(f'Could not detach from stdio: {e}')
os._exit(1) os._exit(1)
@ -187,6 +190,9 @@ class WorkerProcess(multiprocessing_context.Process): # type: ignore[name-defin
# Set the queue on Display so calls to Display.display are proxied over the queue # Set the queue on Display so calls to Display.display are proxied over the queue
display.set_queue(self._final_q) display.set_queue(self._final_q)
self._detach() self._detach()
# propagate signals
signal.signal(signal.SIGINT, self._term)
signal.signal(signal.SIGTERM, self._term)
try: try:
with _task.TaskContext(self._task): with _task.TaskContext(self._task):
return self._run() return self._run()

@ -18,8 +18,10 @@
from __future__ import annotations from __future__ import annotations
import dataclasses import dataclasses
import errno
import os import os
import sys import sys
import signal
import tempfile import tempfile
import threading import threading
import time import time
@ -185,8 +187,48 @@ class TaskQueueManager:
# plugins for inter-process locking. # plugins for inter-process locking.
self._connection_lockfile = tempfile.TemporaryFile() self._connection_lockfile = tempfile.TemporaryFile()
self._workers: list[WorkerProcess | None] = []
# signal handlers to propagate signals to workers
signal.signal(signal.SIGTERM, self._signal_handler)
signal.signal(signal.SIGINT, self._signal_handler)
def _initialize_processes(self, num: int) -> None: def _initialize_processes(self, num: int) -> None:
self._workers: list[WorkerProcess | None] = [None] * num # mutable update to ensure the reference stays the same
self._workers[:] = [None] * num
def _signal_handler(self, signum, frame) -> None:
    """Propagate SIGINT/SIGTERM from the parent to all live worker processes.

    Workers call setsid after forking and become process-group leaders, so
    signals delivered to the parent's group do not reach them automatically;
    this handler forwards the signal to each live worker explicitly.

    :param signum: the signal number being handled (SIGINT or SIGTERM)
    :param frame: current stack frame (unused, required by the handler API)
    """
    # Restore the default disposition first so re-delivering the same signal
    # to ourselves below terminates us instead of re-entering this handler.
    signal.signal(signum, signal.SIG_DFL)
    for worker in self._workers:
        if worker is None or not worker.is_alive():
            continue
        if worker.pid:
            try:
                # notify workers
                os.kill(worker.pid, signum)
            except OSError as e:
                # ESRCH: the worker exited between the is_alive() check and
                # the kill — nothing to report in that case
                if e.errno != errno.ESRCH:
                    signame = signal.strsignal(signum)
                    display.error(f'Unable to send {signame} to child[{worker.pid}]: {e}')
    if signum == signal.SIGINT:
        # Defer to CLI handling
        raise KeyboardInterrupt()
    # For SIGTERM, re-deliver the signal to ourselves; with SIG_DFL restored
    # above this terminates the parent with the conventional signal status.
    pid = os.getpid()
    try:
        os.kill(pid, signum)
    except OSError as e:
        signame = signal.strsignal(signum)
        display.error(f'Unable to send {signame} to {pid}: {e}')
def load_callbacks(self): def load_callbacks(self):
""" """

@ -2,4 +2,5 @@ needs/ssh
shippable/posix/group3 shippable/posix/group3
needs/target/connection needs/target/connection
needs/target/setup_test_user needs/target/setup_test_user
needs/target/test_utils
setup/always/setup_passlib_controller # required for setup_test_user setup/always/setup_passlib_controller # required for setup_test_user

@ -17,7 +17,7 @@ if command -v sshpass > /dev/null; then
# ansible with timeout. If we time out, our custom prompt was successfully # ansible with timeout. If we time out, our custom prompt was successfully
# searched for. It's a weird way of doing things, but it does ensure # searched for. It's a weird way of doing things, but it does ensure
# that the flag gets passed to sshpass. # that the flag gets passed to sshpass.
timeout 5 ansible -m ping \ ../test_utils/scripts/timeout.py 5 -- ansible -m ping \
-e ansible_connection=ssh \ -e ansible_connection=ssh \
-e ansible_ssh_password_mechanism=sshpass \ -e ansible_ssh_password_mechanism=sshpass \
-e ansible_sshpass_prompt=notThis: \ -e ansible_sshpass_prompt=notThis: \

@ -0,0 +1,3 @@
shippable/posix/group4
context/controller
needs/target/test_utils

@ -0,0 +1,14 @@
localhost0
localhost1
localhost2
localhost3
localhost4
localhost5
localhost6
localhost7
localhost8
localhost9
[all:vars]
ansible_connection=local
ansible_python_interpreter={{ansible_playbook_python}}

@ -0,0 +1,21 @@
#!/usr/bin/env bash
set -x
# Run ansible against 10 local hosts with 10 forks; each task spawns a
# long-running "sleep 33" via the pipe lookup. The timeout wrapper sends
# SIGINT after 3 seconds, which should propagate to all worker processes.
../test_utils/scripts/timeout.py -s SIGINT 3 -- \
ansible all -i inventory -m debug -a 'msg={{lookup("pipe", "sleep 33")}}' -f 10
# timeout.py exits 124 when it had to signal the child; anything else means
# ansible finished (or died) before the timeout fired, which is unexpected here.
if [[ "$?" != "124" ]]; then
echo "Process was not terminated due to timeout"
exit 1
fi
# a short sleep to let processes die
sleep 2
# If signal propagation worked, no "sleep 33" children should survive;
# pgrep returning success (rc 0) means orphans were left behind.
sleeps="$(pgrep -alf 'sleep\ 33')"
rc="$?"
if [[ "$rc" == "0" ]]; then
echo "Found lingering processes:"
echo "$sleeps"
exit 1
fi

@ -2,21 +2,32 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import signal
import subprocess import subprocess
import sys import sys
def signal_type(v: str) -> signal.Signals:
    """argparse ``type=`` converter mapping a CLI value to a ``signal.Signals`` member.

    Accepts a numeric signal ('15'), a short name ('TERM'), or a full
    name ('SIGTERM').

    :param v: the raw command-line argument string
    :returns: the matching ``signal.Signals`` member
    :raises argparse.ArgumentTypeError: for an unknown signal, so argparse
        reports a usage error instead of an unhandled traceback (argparse
        only converts ValueError/TypeError/ArgumentTypeError into errors;
        the previous ``getattr`` raised AttributeError, which crashed).
    """
    try:
        if v.isdecimal():
            return signal.Signals(int(v))
        if not v.startswith('SIG'):
            v = f'SIG{v}'
        return signal.Signals[v]
    except (KeyError, ValueError) as e:
        raise argparse.ArgumentTypeError(f'unknown signal: {v}') from e
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('duration', type=int) parser.add_argument('duration', type=int)
parser.add_argument('--signal', '-s', default=signal.SIGTERM, type=signal_type)
parser.add_argument('command', nargs='+') parser.add_argument('command', nargs='+')
args = parser.parse_args() args = parser.parse_args()
p: subprocess.Popen | None = None
try: try:
p = subprocess.run( p = subprocess.Popen(args.command)
' '.join(args.command), p.wait(timeout=args.duration)
shell=True,
timeout=args.duration,
check=False,
)
sys.exit(p.returncode) sys.exit(p.returncode)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
if p and p.poll() is None:
p.send_signal(args.signal)
p.wait()
sys.exit(124) sys.exit(124)

Loading…
Cancel
Save