Merge pull request #263 from dw/dmw

Stray mux process on CTRL+C, EINTR on async task timeout, temp dir cleanup race
pull/274/head
dw 8 years ago committed by GitHub
commit 876a82f00d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -51,7 +51,6 @@ except ImportError: # Ansible <2.4
from ansible.plugins import module_loader from ansible.plugins import module_loader
from ansible.plugins import module_utils_loader from ansible.plugins import module_utils_loader
import mitogen
import ansible_mitogen.target import ansible_mitogen.target

@ -30,6 +30,7 @@ from __future__ import absolute_import
import errno import errno
import logging import logging
import os import os
import signal
import socket import socket
import sys import sys
@ -110,7 +111,7 @@ class MuxProcess(object):
if cls.child_pid: if cls.child_pid:
cls.child_sock.close() cls.child_sock.close()
cls.child_sock = None cls.child_sock = None
cls.worker_sock.recv(1) mitogen.core.io_op(cls.worker_sock.recv, 1)
else: else:
cls.worker_sock.close() cls.worker_sock.close()
cls.worker_sock = None cls.worker_sock = None
@ -128,9 +129,9 @@ class MuxProcess(object):
self._setup_services() self._setup_services()
# Let the parent know our listening socket is ready. # Let the parent know our listening socket is ready.
self.child_sock.send('1') mitogen.core.io_op(self.child_sock.send, '1')
# Block until the socket is closed, which happens on parent exit. # Block until the socket is closed, which happens on parent exit.
self.child_sock.recv(1) mitogen.core.io_op(self.child_sock.recv, 1)
def _setup_master(self): def _setup_master(self):
""" """
@ -140,6 +141,7 @@ class MuxProcess(object):
self.router.responder.whitelist_prefix('ansible') self.router.responder.whitelist_prefix('ansible')
self.router.responder.whitelist_prefix('ansible_mitogen') self.router.responder.whitelist_prefix('ansible_mitogen')
mitogen.core.listen(self.router.broker, 'shutdown', self.on_broker_shutdown) mitogen.core.listen(self.router.broker, 'shutdown', self.on_broker_shutdown)
mitogen.core.listen(self.router.broker, 'exit', self.on_broker_exit)
self.listener = mitogen.unix.Listener( self.listener = mitogen.unix.Listener(
router=self.router, router=self.router,
path=self.unix_listener_path, path=self.unix_listener_path,
@ -168,14 +170,23 @@ class MuxProcess(object):
def on_broker_shutdown(self): def on_broker_shutdown(self):
""" """
Respond to the Router shutdown (indirectly triggered through exit of Respond to broker shutdown by beginning service pool shutdown. Do not
the main thread) by unlinking the listening socket. Ideally this would join on the pool yet, since that would block the broker thread which
happen explicitly, but Ansible provides no hook to allow it. then cannot clean up pending handlers, which is required for the
threads to exit gracefully.
""" """
self.pool.stop() self.pool.stop(join=False)
try: try:
os.unlink(self.listener.path) os.unlink(self.listener.path)
except OSError, e: except OSError, e:
# Prevent a shutdown race with the parent process. # Prevent a shutdown race with the parent process.
if e.args[0] != errno.ENOENT: if e.args[0] != errno.ENOENT:
raise raise
def on_broker_exit(self):
    """
    Handler for the broker's 'exit' event: deliver SIGTERM to the current
    process.

    A graceful join of the service pool is the intended long-term
    behaviour; terminating via TERM is considered acceptable for now.
    """
    own_pid = os.getpid()
    os.kill(own_pid, signal.SIGTERM)

@ -38,6 +38,7 @@ how to build arguments for it, preseed related data, etc.
from __future__ import absolute_import from __future__ import absolute_import
import cStringIO import cStringIO
import ctypes import ctypes
import errno
import imp import imp
import json import json
import logging import logging
@ -148,7 +149,7 @@ class Runner(object):
implementation simply restores the original environment. implementation simply restores the original environment.
""" """
self._env.revert() self._env.revert()
self._cleanup_temp() self._try_cleanup_temp()
def _cleanup_temp(self): def _cleanup_temp(self):
""" """
@ -162,6 +163,20 @@ class Runner(object):
LOG.debug('Deleting %r', path) LOG.debug('Deleting %r', path)
ansible_mitogen.target.prune_tree(path) ansible_mitogen.target.prune_tree(path)
def _try_cleanup_temp(self):
"""
During broker shutdown triggered by async task timeout or loss of
connection to the parent, it is possible for prune_tree() in
target.py::_on_broker_shutdown() to run before _cleanup_temp(), so skip
cleanup if the directory or a file disappears from beneath us.
"""
try:
self._cleanup_temp()
except (IOError, OSError) as e:
if e.args[0] == errno.ENOENT:
return
raise
def _run(self): def _run(self):
""" """
The _run() method is expected to return a dictionary in the form of The _run() method is expected to return a dictionary in the form of

@ -266,75 +266,67 @@ def _get_async_dir():
) )
def _write_job_status(job_id, dct): class AsyncRunner(object):
""" def __init__(self, job_id, timeout_secs, econtext, kwargs):
Update an async job status file. self.job_id = job_id
""" self.timeout_secs = timeout_secs
LOG.info('_write_job_status(%r, %r)', job_id, dct) self.econtext = econtext
dct.setdefault('ansible_job_id', job_id) self.kwargs = kwargs
dct.setdefault('data', '') self._timed_out = False
self._init_path()
async_dir = _get_async_dir()
if not os.path.exists(async_dir): def _init_path(self):
os.makedirs(async_dir) async_dir = _get_async_dir()
if not os.path.exists(async_dir):
path = os.path.join(async_dir, job_id) os.makedirs(async_dir)
with open(path + '.tmp', 'w') as fp: self.path = os.path.join(async_dir, self.job_id)
fp.write(json.dumps(dct))
os.rename(path + '.tmp', path) def _update(self, dct):
"""
Update an async job status file.
def _sigalrm(broker, timeout_secs, job_id): """
""" LOG.info('%r._update(%r, %r)', self, self.job_id, dct)
Respond to SIGALRM (job timeout) by updating the job file and killing the dct.setdefault('ansible_job_id', self.job_id)
process. dct.setdefault('data', '')
"""
msg = "Job reached maximum time limit of %d seconds." % (timeout_secs,) with open(self.path + '.tmp', 'w') as fp:
_write_job_status(job_id, { fp.write(json.dumps(dct))
"failed": 1, os.rename(self.path + '.tmp', self.path)
"finished": 1,
"msg": msg, def _on_sigalrm(self, signum, frame):
}) """
broker.shutdown() Respond to SIGALRM (job timeout) by updating the job file and killing
the process.
"""
def _install_alarm(broker, timeout_secs, job_id): msg = "Job reached maximum time limit of %d seconds." % (
handler = lambda *_: _sigalrm(broker, timeout_secs, job_id) self.timeout_secs,
signal.signal(signal.SIGALRM, handler) )
signal.alarm(timeout_secs) self._update({
"failed": 1,
"finished": 1,
def _run_module_async(kwargs, job_id, timeout_secs, econtext): "msg": msg,
""" })
1. Immediately updates the status file to mark the job as started. self._timed_out = True
2. Installs a timer/signal handler to implement the time limit. self.econtext.broker.shutdown()
3. Runs as with run_module(), writing the result to the status file.
def _install_alarm(self):
:param dict kwargs: signal.signal(signal.SIGALRM, self._on_sigalrm)
Runner keyword arguments. signal.alarm(self.timeout_secs)
:param str job_id:
String job ID. def _run_module(self):
:param int timeout_secs: kwargs = dict(self.kwargs, **{
If >0, limit the task's maximum run time. 'detach': True,
""" 'econtext': self.econtext,
_write_job_status(job_id, { 'emulate_tty': False,
'started': 1, })
'finished': 0,
'pid': os.getpid()
})
if timeout_secs > 0:
_install_alarm(econtext.broker, timeout_secs, job_id)
kwargs['detach'] = True dct = run_module(kwargs)
kwargs['econtext'] = econtext if mitogen.core.PY3:
kwargs['emulate_tty'] = False for key in 'stdout', 'stderr':
dct = run_module(kwargs) dct[key] = dct[key].decode('utf-8', 'surrogateescape')
if mitogen.core.PY3: return dct
for key in 'stdout', 'stderr':
dct[key] = dct[key].decode('utf-8', 'surrogateescape')
try: def _parse_result(self, dct):
filtered, warnings = ( filtered, warnings = (
ansible.module_utils.json_utils. ansible.module_utils.json_utils.
_filter_non_json_lines(dct['stdout']) _filter_non_json_lines(dct['stdout'])
@ -342,34 +334,69 @@ def _run_module_async(kwargs, job_id, timeout_secs, econtext):
result = json.loads(filtered) result = json.loads(filtered)
result.setdefault('warnings', []).extend(warnings) result.setdefault('warnings', []).extend(warnings)
result['stderr'] = dct['stderr'] result['stderr'] = dct['stderr']
_write_job_status(job_id, result) self._update(result)
except Exception:
_write_job_status(job_id, { def _run(self):
"failed": 1, """
"msg": traceback.format_exc(), 1. Immediately updates the status file to mark the job as started.
"data": dct['stdout'], # temporary notice only 2. Installs a timer/signal handler to implement the time limit.
"stderr": dct['stderr'] 3. Runs as with run_module(), writing the result to the status file.
:param dict kwargs:
Runner keyword arguments.
:param str job_id:
String job ID.
:param int timeout_secs:
If >0, limit the task's maximum run time.
"""
self._update({
'started': 1,
'finished': 0,
'pid': os.getpid()
}) })
if self.timeout_secs > 0:
self._install_alarm()
dct = self._run_module()
if not self._timed_out:
# After SIGALRM fires, there is a window between broker responding
# to shutdown() by killing the process, and work continuing on the
# main thread. If main thread was asleep in at least
# basic.py/select.select(), an EINTR will be raised. We want to
# discard that exception.
try:
self._parse_result(dct)
except Exception:
self._update({
"failed": 1,
"msg": traceback.format_exc(),
"data": dct['stdout'], # temporary notice only
"stderr": dct['stderr']
})
def run(self):
try:
try:
self._run()
except Exception:
self._update({
"failed": 1,
"msg": traceback.format_exc(),
})
finally:
self.econtext.broker.shutdown()
@mitogen.core.takes_econtext @mitogen.core.takes_econtext
def run_module_async(kwargs, job_id, timeout_secs, econtext): def run_module_async(kwargs, job_id, timeout_secs, econtext):
""" """
Arrange for a module to be executed with its run status and result Execute a module with its run status and result written to a file,
serialized to a disk file. This function expects to run in a child forked terminating the process on completion. This function must run in a child
using :func:`create_fork_child`. forked using :func:`create_fork_child`.
""" """
try: arunner = AsyncRunner(job_id, timeout_secs, econtext, kwargs)
try: arunner.run()
_run_module_async(kwargs, job_id, timeout_secs, econtext)
except Exception:
# Catch any (ansible_mitogen) bugs and write them to the job file.
_write_job_status(job_id, {
"failed": 1,
"msg": traceback.format_exc(),
})
finally:
econtext.broker.shutdown()
def make_temp_directory(base_dir): def make_temp_directory(base_dir):

@ -48,6 +48,15 @@ LOG = logging.getLogger(__name__)
_last = None _last = None
def enable_evil_interrupts():
    """
    Install a no-op SIGALRM handler and arm ITIMER_REAL with a 0.01 second
    initial delay and a 0.01 second repeat interval, so the process is
    interrupted by SIGALRM continuously until the timer is disarmed.

    NOTE(review): presumably a debugging aid for flushing out EINTR
    handling bugs -- confirm against callers.
    """
    def ignore_alarm(signum, frame):
        return None

    signal.signal(signal.SIGALRM, ignore_alarm)
    first_delay = repeat_every = 0.01
    signal.setitimer(signal.ITIMER_REAL, first_delay, repeat_every)
def disable_evil_interrupts():
    """
    Disarm ITIMER_REAL by setting both its delay and its interval to zero.
    The SIGALRM handler itself is left as-is.
    """
    timer = signal.ITIMER_REAL
    signal.setitimer(timer, 0, 0)
def _hex(n): def _hex(n):
return '%08x' % n return '%08x' % n

@ -271,9 +271,12 @@ class SerializedInvoker(Invoker):
method_name, kwargs, msg = tup method_name, kwargs, msg = tup
try: try:
super(SerializedInvoker, self).invoke(method_name, kwargs, msg) super(SerializedInvoker, self).invoke(method_name, kwargs, msg)
except mitogen.core.CallError:
e = sys.exc_info()[1]
LOG.warning('%r: call error: %s: %s', self, msg, e)
msg.reply(e)
except Exception: except Exception:
LOG.exception('%r: while invoking %r of %r', LOG.exception('%r: while invoking %s()', self, method_name)
self, method_name, self.service)
msg.reply(mitogen.core.Message.dead()) msg.reply(mitogen.core.Message.dead())
def invoke(self, method_name, kwargs, msg): def invoke(self, method_name, kwargs, msg):
@ -456,9 +459,13 @@ class Pool(object):
closed = False closed = False
def stop(self): def stop(self, join=True):
self.closed = True self.closed = True
self._select.close() self._select.close()
if join:
self.join()
def join(self):
for th in self._threads: for th in self._threads:
th.join() th.join()
for invoker in self._invoker_by_name.itervalues(): for invoker in self._invoker_by_name.itervalues():

@ -4,6 +4,7 @@ gathering = explicit
strategy_plugins = ../../ansible_mitogen/plugins/strategy strategy_plugins = ../../ansible_mitogen/plugins/strategy
action_plugins = lib/action action_plugins = lib/action
callback_plugins = lib/callback callback_plugins = lib/callback
stdout_callback = nice_stdout
library = lib/modules library = lib/modules
# module_utils = lib/module_utils # module_utils = lib/module_utils
retry_files_enabled = False retry_files_enabled = False

@ -6,16 +6,20 @@
# Start 2 duplicate jobs, verify they run concurrently. # Start 2 duplicate jobs, verify they run concurrently.
- file:
path: /tmp/flurp
state: absent
- name: create semaphore file and sleep for 5 seconds. - name: create semaphore file and sleep for 5 seconds.
shell: | shell: |
exec 2>/dev/null; exec 2>/dev/null;
bash -c ' bash -c '
echo im_alive $$ > /tmp/flurp echo im_alive $$ > /tmp/flurp;
sleep 60; sleep 60;
'; ';
rm -f /tmp/flurp; rm -f /tmp/flurp;
echo alldone echo alldone
async: 1000 async: 30
poll: 0 poll: 0
register: job1 register: job1
@ -24,30 +28,29 @@
# below completes quickly. # below completes quickly.
- name: verify semaphore file exists while this job exists. - name: verify semaphore file exists while this job exists.
shell: | shell: |
[ -f /tmp/flurp ] && { while [ ! -f /tmp/flurp ]; do sleep 0.1; done;
read im_alive pid < /tmp/flurp read im_alive pid < /tmp/flurp
echo $im_alive echo $im_alive
kill $pid &>/dev/null kill $pid &>/dev/null
} async: 30
async: 1000
poll: 0 poll: 0
register: job2 register: job2
- name: (job1) busy-poll up to 100000 times - name: (job1) poll
async_status: async_status:
jid: "{{job1.ansible_job_id}}" jid: "{{job1.ansible_job_id}}"
register: result1 register: result1
until: result1.finished until: result1.finished
retries: 100000 retries: 5
delay: 0 delay: 1
- name: (job2) busy-poll up to 100000 times - name: (job2) poll
async_status: async_status:
jid: "{{job2.ansible_job_id}}" jid: "{{job2.ansible_job_id}}"
register: result2 register: result2
until: result2.finished until: result2.finished
retries: 100000 retries: 5
delay: 0 delay: 1
- assert: - assert:
that: that:

@ -0,0 +1,54 @@
from __future__ import unicode_literals
import io
try:
from ansible.plugins import callback_loader
except ImportError:
from ansible.plugins.loader import callback_loader
def printi(tio, obj, key=None, indent=0):
    """
    Recursively write an indented, line-oriented rendering of `obj` to
    `tio`.

    :param tio:
        Text stream exposing ``write()``; receives unicode text.
    :param obj:
        Value to render. Lists/tuples and dicts are rendered recursively
        between bracket lines, strings are rendered one output line per
        input line, anything else is rendered using ``%r``.
    :param key:
        Optional label (list index or dict key) prefixed to every line
        written for `obj` itself.
    :param int indent:
        Nesting depth; one space is written per level.

    Dict entries whose key starts with ``_ansible_`` or ends with
    ``_lines`` are omitted. Works on both Python 2 and Python 3.
    """
    # Unicode text type on both major versions: `unicode` on 2.x, `str` on
    # 3.x.
    text_type = type(u'')

    def write(s, *args):
        if args:
            s %= args
        tio.write(' ' * indent)
        # `key` is read at call time, so resetting it below strips the
        # label from closing brackets.
        if key is not None:
            tio.write('%s: ' % (key,))
        tio.write(s)
        tio.write('\n')

    def hidden(name):
        # Only string-like keys can match the hidden patterns; other key
        # types (e.g. integers) are always shown rather than crashing on a
        # missing startswith().
        if isinstance(name, bytes):
            name = name.decode('utf-8', 'replace')
        return isinstance(name, text_type) and (
            name.startswith('_ansible_') or
            name.endswith('_lines')
        )

    # Byte strings (`str` on 2.x, `bytes` on 3.x) are decoded leniently and
    # then handled exactly like text.
    if isinstance(obj, bytes):
        obj = obj.decode('utf-8', 'replace')

    if isinstance(obj, (list, tuple)):
        write('[')
        for i, obj2 in enumerate(obj):
            printi(tio, obj2, key=i, indent=indent+1)
        key = None
        write(']')
    elif isinstance(obj, dict):
        write('{')
        for key2, obj2 in sorted(obj.items()):
            if not hidden(key2):
                printi(tio, obj2, key=key2, indent=indent+1)
        key = None
        write('}')
    elif isinstance(obj, text_type):
        for line in obj.splitlines():
            write('%s', line.rstrip('\r\n'))
    else:
        write('%r', obj)
# Look up the 'default' callback through Ansible's callback loader, passing
# class_only=True; the result is used as the base class of CallbackModule.
DefaultModule = callback_loader.get('default', class_only=True)
class CallbackModule(DefaultModule):
    """
    Callback deriving from DefaultModule whose result dumps are rendered
    with printi() instead of the inherited formatting.
    """

    def _dump_results(self, result, *args, **kwargs):
        """
        Render `result` with printi() and return it encoded as ASCII, with
        unencodable characters replaced. Extra positional and keyword
        arguments are accepted and ignored.

        Any failure has its traceback printed before being re-raised.
        """
        try:
            buf = io.StringIO()
            printi(buf, result)
            rendered = buf.getvalue()
            return rendered.encode('ascii', 'replace')
        except:
            # Deliberately bare: nothing is swallowed, the traceback is
            # printed and the exception always propagates.
            import traceback
            traceback.print_exc()
            raise

@ -35,6 +35,7 @@
- has_sudo_pubkey - has_sudo_pubkey
- require_tty - require_tty
- pw_required - pw_required
- readonly_homedir
- require_tty_pw_required - require_tty_pw_required
- slow_user - slow_user
when: ansible_system != 'Darwin' when: ansible_system != 'Darwin'

Loading…
Cancel
Save