From 9fe14e841cb89099abce4589caf20684b7ac4033 Mon Sep 17 00:00:00 2001 From: David Wilson Date: Sat, 21 Apr 2018 15:19:13 +0100 Subject: [PATCH 1/4] parent: reap the child process if connection fails For example if no response is received in :attr:`connect_timeout` seconds, the child would be left running. --- mitogen/parent.py | 6 +++++- tests/data/python_never_responds.sh | 3 +++ tests/parent_test.py | 21 +++++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100755 tests/data/python_never_responds.sh diff --git a/mitogen/parent.py b/mitogen/parent.py index 56dbc31f..36ecb295 100644 --- a/mitogen/parent.py +++ b/mitogen/parent.py @@ -699,7 +699,11 @@ class Stream(mitogen.core.Stream): LOG.debug('%r.connect(): child process stdin/stdout=%r', self, self.receive_side.fd) - self._connect_bootstrap(extra_fd) + try: + self._connect_bootstrap(extra_fd) + except Exception: + self._reap_child() + raise def _ec0_received(self): LOG.debug('%r._ec0_received()', self) diff --git a/tests/data/python_never_responds.sh b/tests/data/python_never_responds.sh new file mode 100755 index 00000000..f1ad5787 --- /dev/null +++ b/tests/data/python_never_responds.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# I am a Python interpreter that sits idle until the connection times out. +exec -a mitogen-tests-python-never-responds.sh sleep 86400 diff --git a/tests/parent_test.py b/tests/parent_test.py index 590380cd..ec74eb3f 100644 --- a/tests/parent_test.py +++ b/tests/parent_test.py @@ -1,3 +1,4 @@ +import errno import os import subprocess import tempfile @@ -9,6 +10,26 @@ import testlib import mitogen.parent +class ReapChildTest(testlib.RouterMixin, testlib.TestCase): + def test_connect_timeout(self): + # Ensure the child process is reaped if the connection times out. + stream = mitogen.parent.Stream( + router=self.router, + remote_id=1234, + old_router=self.router, + max_message_size=self.router.max_message_size, + python_path=testlib.data_path('python_never_responds.sh'), + connect_timeout=0.5, + ) + self.assertRaises(mitogen.core.TimeoutError, + lambda: stream.connect() + ) + e = self.assertRaises(OSError, + lambda: os.kill(stream.pid, 0) + ) + self.assertEquals(e.args[0], errno.ESRCH) + + class StreamErrorTest(testlib.RouterMixin, testlib.TestCase): def test_direct_eof(self): e = self.assertRaises(mitogen.core.StreamError, From cbe6be449e7f51e0cae6207ffbf78e1a89de4c81 Mon Sep 17 00:00:00 2001 From: David Wilson Date: Sat, 21 Apr 2018 17:51:59 +0100 Subject: [PATCH 2/4] issue #201: parent: log a warning and work around race for now. --- mitogen/parent.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mitogen/parent.py b/mitogen/parent.py index 36ecb295..1173da09 100644 --- a/mitogen/parent.py +++ b/mitogen/parent.py @@ -26,6 +26,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +import errno import fcntl import getpass import inspect @@ -593,7 +594,15 @@ class Stream(mitogen.core.Stream): # on_disconnect() call. return - pid, status = os.waitpid(self.pid, os.WNOHANG) + try: + pid, status = os.waitpid(self.pid, os.WNOHANG) + except OSError: + e = sys.exc_info()[1] + if e.args[0] == errno.ECHILD: + LOG.warn('%r: waitpid(%r) produced ECHILD', self.pid, self) + return + raise + if pid: LOG.debug('%r: child process exit status was %d', self, status) else: From c5fe817db23522cf299e22a48ee982daa8da7270 Mon Sep 17 00:00:00 2001 From: David Wilson Date: Sat, 21 Apr 2018 18:19:55 +0100 Subject: [PATCH 3/4] ansible: tidy up log repr slightly --- ansible_mitogen/planner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ansible_mitogen/planner.py b/ansible_mitogen/planner.py index 8f4cf77b..254b203d 100644 --- a/ansible_mitogen/planner.py +++ b/ansible_mitogen/planner.py @@ -167,6 +167,9 @@ class Planner(object): kwargs.setdefault('wrap_async', invocation.wrap_async) return kwargs + def __repr__(self): + return '%s()' % (type(self).__name__,) + class BinaryPlanner(Planner): """ From dc4433ace6ce2f19b5dca3c25d7fb1de40a771ff Mon Sep 17 00:00:00 2001 From: David Wilson Date: Sat, 21 Apr 2018 18:21:24 +0100 Subject: [PATCH 4/4] issue #202: ansible: forget all dependent contexts on Stream disconnect This is a partial fix, there are still at least 2 cases needing covered: - In-progress connections must have CallError or similar sent to any waiters - Once connection delegation exists, it is possible for other worker processes to be active (and in any step in the process), trying to communicate with a context that we know can no longer be communicated with. The solution to that isn't clear yet. Additionally ensure root has /bin/bash shell in both Docker images. --- ansible_mitogen/services.py | 33 +++++++++++++++++++ .../integration/context_service/all.yml | 1 + .../context_service/reconnection.yml | 30 +++++++++++++++++ tests/build_docker_images.py | 1 + 4 files changed, 65 insertions(+) create mode 100644 tests/ansible/integration/context_service/reconnection.yml diff --git a/ansible_mitogen/services.py b/ansible_mitogen/services.py index 6bfede0b..3665e475 100644 --- a/ansible_mitogen/services.py +++ b/ansible_mitogen/services.py @@ -199,6 +199,29 @@ class ContextService(mitogen.service.Service): self._shutdown(context) self._lru_by_via = {} + def _on_stream_disconnect(self, stream): + """ + Respond to Stream disconnection by deleting any record of contexts + reached via that stream. This method runs in the Broker thread and must + not to block. + """ + # TODO: there is a race between creation of a context and disconnection + # of its related stream. An error reply should be sent to any message + # in _waiters_by_key below. + self._lock.acquire() + try: + for context, key in list(self._key_by_context.items()): + if context.context_id in stream.routes: + LOG.info('Dropping %r due to disconnect of %r', + context, stream) + self._response_by_key.pop(key, None) + self._waiters_by_key.pop(key, None) + self._refs_by_context.pop(context, None) + self._lru_by_via.pop(context, None) + self._refs_by_context.pop(context, None) + finally: + self._lock.release() + def _connect(self, key, method_name, **kwargs): """ Actual connect implementation. Arranges for the Mitogen connection to @@ -240,14 +263,24 @@ class ContextService(mitogen.service.Service): if kwargs.get('via'): self._update_lru(context, method_name=method_name, **kwargs) + else: + # For directly connected contexts, listen to the associated + # Stream's disconnect event and use it to invalidate dependent + # Contexts. + stream = self.router.stream_by_id(context.context_id) + mitogen.core.listen(stream, 'disconnect', + lambda: self._on_stream_disconnect(stream)) + home_dir = context.call(os.path.expanduser, '~') # We don't need to wait for the result of this. Ideally we'd check its # return value somewhere, but logs will catch a failure anyway. context.call_async(ansible_mitogen.target.start_fork_parent) + if os.environ.get('MITOGEN_DUMP_THREAD_STACKS'): from mitogen import debug context.call(debug.dump_to_logger) + self._key_by_context[context] = key self._refs_by_context[context] = 0 return { diff --git a/tests/ansible/integration/context_service/all.yml b/tests/ansible/integration/context_service/all.yml index dc84bc0b..e70199f8 100644 --- a/tests/ansible/integration/context_service/all.yml +++ b/tests/ansible/integration/context_service/all.yml @@ -1 +1,2 @@ - import_playbook: lru_one_target.yml +- import_playbook: reconnection.yml diff --git a/tests/ansible/integration/context_service/reconnection.yml b/tests/ansible/integration/context_service/reconnection.yml new file mode 100644 index 00000000..8d7c7e60 --- /dev/null +++ b/tests/ansible/integration/context_service/reconnection.yml @@ -0,0 +1,30 @@ +# Test ContextService ability to handle disconnections, including handling +# cleanup of dependent (via=) contexts. + +- name: integration/context_service/reconnection.yml + hosts: all + any_errors_fatal: true + tasks: + + - become: true + custom_python_detect_environment: + register: old_become_env + + - become: true + # This must be >1 for vanilla Ansible. + shell: | + bash -c "( sleep 3; pkill -f sshd:; ) & disown" + + - connection: local + shell: sleep 3 + + - wait_for_connection: + + - become: true + custom_python_detect_environment: + register: new_become_env + + # Verify the PIDs really changed (i.e. disconnection happened) + - assert: + that: + - old_become_env.pid != new_become_env.pid diff --git a/tests/build_docker_images.py b/tests/build_docker_images.py index 758d0a9c..13eeb1ae 100755 --- a/tests/build_docker_images.py +++ b/tests/build_docker_images.py @@ -35,6 +35,7 @@ RUN yum clean all && \ DOCKERFILE = r""" COPY data/001-mitogen.sudo /etc/sudoers.d/001-mitogen RUN \ + chsh -s /bin/bash && \ mkdir -p /var/run/sshd && \ echo i-am-mitogen-test-docker-image > /etc/sentinel && \ groupadd mitogen__sudo_nopw && \