From dfc67b89fdb055010c8a05987eb5d2be142e1775 Mon Sep 17 00:00:00 2001 From: David Wilson Date: Mon, 10 Sep 2018 18:32:23 +0100 Subject: [PATCH 1/3] docs: some more cleanups - add faulthandler/thread stacks to changelog. - various api.rst cleanups. - docs: explain chain_id in howitworks. --- docs/ansible.rst | 9 ++++++--- docs/api.rst | 26 +++++++++++++------------- docs/changelog.rst | 8 ++++++++ docs/howitworks.rst | 7 +++++++ mitogen/parent.py | 9 +++++---- 5 files changed, 39 insertions(+), 20 deletions(-) diff --git a/docs/ansible.rst b/docs/ansible.rst index 7a06c120..485c24dc 100644 --- a/docs/ansible.rst +++ b/docs/ansible.rst @@ -313,9 +313,10 @@ Performance ^^^^^^^^^^^ One roundtrip initiates a transfer larger than 124 KiB, while smaller transfers -are embedded in a 0-roundtrip remote call. For tools operating via SSH -multiplexing, 4 roundtrips are required to configure the IO channel, in -addition to the time to start the local and remote processes. +are embedded in a 0-roundtrip pipelined call. For tools operating via SSH +multiplexing, 4 roundtrips are required to configure the IO channel, followed +by 6 roundtrips to transfer the file in the case of ``sftp``, in addition to +the time to start the local and remote processes. An invocation of ``scp`` with an empty ``.profile`` over a 30 ms link takes ~140 ms, wasting 110 ms per invocation, rising to ~2,000 ms over a 400 ms @@ -848,6 +849,8 @@ logging is necessary. File-based logging can be enabled by setting enabled, one file per context will be created on the local machine and every target machine, as ``/tmp/mitogen..log``. +.. _diagnosing-hangs: + Diagnosing Hangs ~~~~~~~~~~~~~~~~ diff --git a/docs/api.rst b/docs/api.rst index cb980b55..c74193e3 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -792,8 +792,8 @@ Context Class .. method:: send (msg) - Arrange for `msg` to be delivered to this context. Updates the - message's `dst_id` prior to routing it via the associated router. + Arrange for `msg` to be delivered to this context. + :attr:`dst_id ` is set to the target context ID. :param mitogen.core.Message msg: The message. @@ -801,9 +801,9 @@ Context Class .. method:: send_async (msg, persist=False) Arrange for `msg` to be delivered to this context, with replies - delivered to a newly constructed Receiver. Updates the message's - `dst_id` prior to routing it via the associated router and registers a - handle which is placed in the message's `reply_to`. + directed to a newly constructed receiver. :attr:`dst_id + ` is set to the target context ID, and :attr:`reply_to + ` is set to the newly constructed receiver's handle. :param bool persist: If :data:`False`, the handler will be unregistered after a single @@ -818,15 +818,15 @@ Context Class .. method:: send_await (msg, deadline=None) - As with :meth:`send_async`, but expect a single reply - (`persist=False`) delivered within `deadline` seconds. + Like :meth:`send_async`, but expect a single reply (`persist=False`) + delivered within `deadline` seconds. :param mitogen.core.Message msg: The message. - :param float deadline: If not :data:`None`, seconds before timing out waiting for a reply. - + :returns: + The deserialized reply. :raises mitogen.core.TimeoutError: No message was received and `deadline` passed. @@ -838,9 +838,9 @@ Context Class .. class:: Context - Extend :class:`mitogen.core.Router` with functionality useful to - masters, and child contexts who later become parents. Currently when this - class is required, the target context's router is upgraded at runtime. + Extend :class:`mitogen.core.Context` with functionality useful to masters, + and child contexts who later become parents. Currently when this class is + required, the target context's router is upgraded at runtime. .. attribute:: default_call_chain @@ -1196,7 +1196,7 @@ Broker Class Responsible for handling I/O multiplexing in a private thread. **Note:** This is the somewhat limited core version of the Broker class - used by child contexts. The master subclass is documented below this one. + used by child contexts. The master subclass is documented below. .. attribute:: shutdown_timeout = 3.0 diff --git a/docs/changelog.rst b/docs/changelog.rst index 7beb043c..f4c46976 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -74,6 +74,14 @@ Enhancements improvement, but is easily noticeable on short many-target runs, where startup overhead dominates runtime. +* The `faulthandler `_ module is + automatically activated if it is installed, simplifying debugging of hangs. + See :ref:`diagnosing-hangs` for more information. + +* The ``MITOGEN_DUMP_THREAD_STACKS`` environment variable's value now indicates + the number of seconds between stack dumps. See :ref:`diagnosing-hangs` for + more information. + Fixes ^^^^^ diff --git a/docs/howitworks.rst b/docs/howitworks.rst index 1e3d2768..a3b08eac 100644 --- a/docs/howitworks.rst +++ b/docs/howitworks.rst @@ -377,6 +377,13 @@ Children listen on the following handles: 6-tuples from :class:`mitogen.parent.CallChain`, imports ``mod_name``, then attempts to execute `class_name.func_name(\*args, \**kwargs)`. + * `chain_id`: if not :data:`None`, an identifier unique to the originating + :class:`mitogen.parent.CallChain`. When set, if an exception occurs + during a call, future calls with the same ID automatically fail with the + same exception without ever executing, and failed calls with no + `reply_to` set are not dumped to the logging framework as they otherwise + would. This is used to implement pipelining. + When this channel is closed (by way of receiving a dead message), the child's main thread begins graceful shutdown of its own :py:class:`Broker` and :py:class:`Router`. diff --git a/mitogen/parent.py b/mitogen/parent.py index 8e9f53a5..bb2b5d1e 100644 --- a/mitogen/parent.py +++ b/mitogen/parent.py @@ -1149,14 +1149,15 @@ class CallChain(object): calls execute until :meth:`reset` is invoked. No exception is logged for calls made with :meth:`call_no_reply`, instead - it is saved and reported as the result of subsequent :meth:`call` or - :meth:`call_async` calls. + the exception is saved and reported as the result of subsequent + :meth:`call` or :meth:`call_async` calls. Sequences of asynchronous calls can be made without wasting network round-trips to discover if prior calls succeed, and chains originating from multiple unrelated source contexts may overlap concurrently at a target - context without interference. In this example, 4 calls complete in one - round-trip:: + context without interference. + + In this example, 4 calls complete in one round-trip:: chain = mitogen.parent.CallChain(context, pipelined=True) chain.call_no_reply(os.mkdir, '/tmp/foo') From 530fd18e4c9aa2f39f6e9ddf013f423ca13fa5c9 Mon Sep 17 00:00:00 2001 From: David Wilson Date: Mon, 10 Sep 2018 18:59:29 +0100 Subject: [PATCH 2/3] service: wake FileService client on file open failure. Previously client would hang and excption woud be dumped to logger. --- mitogen/service.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mitogen/service.py b/mitogen/service.py index 3713bdea..ffb7308e 100644 --- a/mitogen/service.py +++ b/mitogen/service.py @@ -873,7 +873,14 @@ class FileService(Service): raise Error(self.context_mismatch_msg) LOG.debug('Serving %r', path) - fp = open(path, 'rb', self.IO_SIZE) + try: + fp = open(path, 'rb', self.IO_SIZE) + except IOError: + msg.reply(mitogen.core.CallError( + sys.exc_info()[1] + )) + return + # Response must arrive first so requestee can begin receive loop, # otherwise first ack won't arrive until all pending chunks were # delivered. In that case max BDP would always be 128KiB, aka. max From 638b196a4538926c125b9258db68c537a3098ee3 Mon Sep 17 00:00:00 2001 From: David Wilson Date: Mon, 10 Sep 2018 19:05:27 +0100 Subject: [PATCH 3/3] ansible: fix put_file() for large temporary files. Reverts 49736b3a, large file copies can't avoid the RTT. The parent stack must be blocked while FileService progresses, as unlike the small file path, it does not make a snapshot of the (possibly temporary) file passed by the action plug-in. So we need to keep that file alive while the service runs. Add a new integration test and a new soak test to cover both. --- ansible_mitogen/connection.py | 12 ++--- docs/changelog.rst | 5 -- tests/ansible/integration/action/all.yml | 1 + tests/ansible/integration/action/copy.yml | 66 +++++++++++++++++++++++ tests/ansible/soak/_file_service_loop.yml | 6 +++ tests/ansible/soak/file_service.yml | 6 +++ 6 files changed, 84 insertions(+), 12 deletions(-) create mode 100644 tests/ansible/integration/action/copy.yml create mode 100644 tests/ansible/soak/_file_service_loop.yml create mode 100644 tests/ansible/soak/file_service.yml diff --git a/ansible_mitogen/connection.py b/ansible_mitogen/connection.py index c5c96f24..12626485 100644 --- a/ansible_mitogen/connection.py +++ b/ansible_mitogen/connection.py @@ -880,13 +880,11 @@ class Connection(ansible.plugins.connection.ConnectionBase): path=mitogen.utils.cast(in_path) ) - # A roundtrip is always necessary for the target to request the file - # from FileService, however, by pipelining the transfer function, the - # subsequent step (probably a module invocation) can get its - # dependencies and function call in-flight before the transfer is - # complete. This saves at least 1 RTT between the transfer completing - # and the start of the follow-up task. - self.get_chain().call_no_reply( + # For now this must remain synchronous, as the action plug-in may have + # passed us a temporary file to transfer. A future FileService could + # maintain an LRU list of open file descriptors to keep the temporary + # file alive, but that requires more work. + self.get_chain().call( ansible_mitogen.target.transfer_file, context=self.parent, in_path=in_path, diff --git a/docs/changelog.rst b/docs/changelog.rst index f4c46976..162a7c48 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -53,11 +53,6 @@ Enhancements a 250 ms link from 30 seconds to 10 seconds compared to v0.2.2, down from 120 seconds compared to vanilla. -* `49736b3a `_: avoid a - roundtrip when transferring files larger than 124KiB, removing a delay - between waiting for the transfer to complete and start of the follow-up - action. - * `#337 `_: To avoid a scaling limitation, a PTY is no longer allocated for an SSH connection unless the configuration specifies a password. diff --git a/tests/ansible/integration/action/all.yml b/tests/ansible/integration/action/all.yml index c5cb80d7..75f77243 100644 --- a/tests/ansible/integration/action/all.yml +++ b/tests/ansible/integration/action/all.yml @@ -1,3 +1,4 @@ +- import_playbook: copy.yml - import_playbook: fixup_perms2__copy.yml - import_playbook: low_level_execute_command.yml - import_playbook: make_tmp_path.yml diff --git a/tests/ansible/integration/action/copy.yml b/tests/ansible/integration/action/copy.yml new file mode 100644 index 00000000..e3fca87f --- /dev/null +++ b/tests/ansible/integration/action/copy.yml @@ -0,0 +1,66 @@ +# Verify copy module for small and large files, and inline content. + +- name: integration/action/synchronize.yml + hosts: test-targets + any_errors_fatal: true + tasks: + - copy: + dest: /tmp/copy-tiny-file + content: + this is a tiny file. + connection: local + + - copy: + dest: /tmp/copy-large-file + # Must be larger than Connection.SMALL_SIZE_LIMIT. + content: "{% for x in range(200000) %}x{% endfor %}" + connection: local + + # end of making files + + - file: + state: absent + path: "{{item}}" + with_items: + - /tmp/copy-tiny-file.out + - /tmp/copy-large-file.out + - /tmp/copy-tiny-inline-file.out + - /tmp/copy-large-inline-file.out + + # end of cleaning out files + + - copy: + dest: /tmp/copy-large-file.out + src: /tmp/copy-large-file + + - copy: + dest: /tmp/copy-tiny-file.out + src: /tmp/copy-tiny-file + + - copy: + dest: /tmp/copy-tiny-inline-file.out + content: "tiny inline content" + + - copy: + dest: /tmp/copy-large-inline-file.out + content: | + {% for x in range(200000) %}y{% endfor %} + + # stat results + + - stat: + path: "{{item}}" + with_items: + - /tmp/copy-tiny-file.out + - /tmp/copy-large-file.out + - /tmp/copy-tiny-inline-file.out + - /tmp/copy-large-inline-file.out + register: stat + + - assert: + that: + - stat.results[0].stat.checksum == "f29faa9a6f19a700a941bf2aa5b281643c4ec8a0" + - stat.results[1].stat.checksum == "62951f943c41cdd326e5ce2b53a779e7916a820d" + - stat.results[2].stat.checksum == "b26dd6444595e2bdb342aa0a91721b57478b5029" + - stat.results[3].stat.checksum == "d675f47e467eae19e49032a2cc39118e12a6ee72" + diff --git a/tests/ansible/soak/_file_service_loop.yml b/tests/ansible/soak/_file_service_loop.yml new file mode 100644 index 00000000..96111b3c --- /dev/null +++ b/tests/ansible/soak/_file_service_loop.yml @@ -0,0 +1,6 @@ + - file: + path: /tmp/foo-{{inventory_hostname}} + state: absent + - copy: + dest: /tmp/foo-{{inventory_hostname}} + content: "{{content}}" diff --git a/tests/ansible/soak/file_service.yml b/tests/ansible/soak/file_service.yml new file mode 100644 index 00000000..3b338b3c --- /dev/null +++ b/tests/ansible/soak/file_service.yml @@ -0,0 +1,6 @@ +- hosts: all + tasks: + - set_fact: + content: "{% for x in range(126977) %}x{% endfor %}" + - include_tasks: _file_service_loop.yml + with_sequence: start=1 end=100