Merge pull request #263 from dw/dmw

Stray mux process on CTRL+C, EINTR on async task timeout, temp dir cleanup race
pull/274/head
dw 8 years ago committed by GitHub
commit 876a82f00d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -51,7 +51,6 @@ except ImportError: # Ansible <2.4
from ansible.plugins import module_loader from ansible.plugins import module_loader
from ansible.plugins import module_utils_loader from ansible.plugins import module_utils_loader
import mitogen
import ansible_mitogen.target import ansible_mitogen.target

@ -30,6 +30,7 @@ from __future__ import absolute_import
import errno import errno
import logging import logging
import os import os
import signal
import socket import socket
import sys import sys
@ -110,7 +111,7 @@ class MuxProcess(object):
if cls.child_pid: if cls.child_pid:
cls.child_sock.close() cls.child_sock.close()
cls.child_sock = None cls.child_sock = None
cls.worker_sock.recv(1) mitogen.core.io_op(cls.worker_sock.recv, 1)
else: else:
cls.worker_sock.close() cls.worker_sock.close()
cls.worker_sock = None cls.worker_sock = None
@ -128,9 +129,9 @@ class MuxProcess(object):
self._setup_services() self._setup_services()
# Let the parent know our listening socket is ready. # Let the parent know our listening socket is ready.
self.child_sock.send('1') mitogen.core.io_op(self.child_sock.send, '1')
# Block until the socket is closed, which happens on parent exit. # Block until the socket is closed, which happens on parent exit.
self.child_sock.recv(1) mitogen.core.io_op(self.child_sock.recv, 1)
def _setup_master(self): def _setup_master(self):
""" """
@ -140,6 +141,7 @@ class MuxProcess(object):
self.router.responder.whitelist_prefix('ansible') self.router.responder.whitelist_prefix('ansible')
self.router.responder.whitelist_prefix('ansible_mitogen') self.router.responder.whitelist_prefix('ansible_mitogen')
mitogen.core.listen(self.router.broker, 'shutdown', self.on_broker_shutdown) mitogen.core.listen(self.router.broker, 'shutdown', self.on_broker_shutdown)
mitogen.core.listen(self.router.broker, 'exit', self.on_broker_exit)
self.listener = mitogen.unix.Listener( self.listener = mitogen.unix.Listener(
router=self.router, router=self.router,
path=self.unix_listener_path, path=self.unix_listener_path,
@ -168,14 +170,23 @@ class MuxProcess(object):
def on_broker_shutdown(self): def on_broker_shutdown(self):
""" """
Respond to the Router shutdown (indirectly triggered through exit of Respond to broker shutdown by beginning service pool shutdown. Do not
the main thread) by unlinking the listening socket. Ideally this would join on the pool yet, since that would block the broker thread which
happen explicitly, but Ansible provides no hook to allow it. then cannot clean up pending handlers, which is required for the
threads to exit gracefully.
""" """
self.pool.stop() self.pool.stop(join=False)
try: try:
os.unlink(self.listener.path) os.unlink(self.listener.path)
except OSError, e: except OSError, e:
# Prevent a shutdown race with the parent process. # Prevent a shutdown race with the parent process.
if e.args[0] != errno.ENOENT: if e.args[0] != errno.ENOENT:
raise raise
def on_broker_exit(self):
"""
Respond to the broker thread about to exit by sending SIGTERM to
ourself. In future this should gracefully join the pool, but TERM is
fine for now.
"""
os.kill(os.getpid(), signal.SIGTERM)

@ -38,6 +38,7 @@ how to build arguments for it, preseed related data, etc.
from __future__ import absolute_import from __future__ import absolute_import
import cStringIO import cStringIO
import ctypes import ctypes
import errno
import imp import imp
import json import json
import logging import logging
@ -148,7 +149,7 @@ class Runner(object):
implementation simply restores the original environment. implementation simply restores the original environment.
""" """
self._env.revert() self._env.revert()
self._cleanup_temp() self._try_cleanup_temp()
def _cleanup_temp(self): def _cleanup_temp(self):
""" """
@ -162,6 +163,20 @@ class Runner(object):
LOG.debug('Deleting %r', path) LOG.debug('Deleting %r', path)
ansible_mitogen.target.prune_tree(path) ansible_mitogen.target.prune_tree(path)
def _try_cleanup_temp(self):
"""
During broker shutdown triggered by async task timeout or loss of
connection to the parent, it is possible for prune_tree() in
target.py::_on_broker_shutdown() to run before _cleanup_temp(), so skip
cleanup if the directory or a file disappears from beneath us.
"""
try:
self._cleanup_temp()
except (IOError, OSError) as e:
if e.args[0] == errno.ENOENT:
return
raise
def _run(self): def _run(self):
""" """
The _run() method is expected to return a dictionary in the form of The _run() method is expected to return a dictionary in the form of

@ -266,45 +266,77 @@ def _get_async_dir():
) )
def _write_job_status(job_id, dct): class AsyncRunner(object):
def __init__(self, job_id, timeout_secs, econtext, kwargs):
self.job_id = job_id
self.timeout_secs = timeout_secs
self.econtext = econtext
self.kwargs = kwargs
self._timed_out = False
self._init_path()
def _init_path(self):
async_dir = _get_async_dir()
if not os.path.exists(async_dir):
os.makedirs(async_dir)
self.path = os.path.join(async_dir, self.job_id)
def _update(self, dct):
""" """
Update an async job status file. Update an async job status file.
""" """
LOG.info('_write_job_status(%r, %r)', job_id, dct) LOG.info('%r._update(%r, %r)', self, self.job_id, dct)
dct.setdefault('ansible_job_id', job_id) dct.setdefault('ansible_job_id', self.job_id)
dct.setdefault('data', '') dct.setdefault('data', '')
async_dir = _get_async_dir() with open(self.path + '.tmp', 'w') as fp:
if not os.path.exists(async_dir):
os.makedirs(async_dir)
path = os.path.join(async_dir, job_id)
with open(path + '.tmp', 'w') as fp:
fp.write(json.dumps(dct)) fp.write(json.dumps(dct))
os.rename(path + '.tmp', path) os.rename(self.path + '.tmp', self.path)
def _sigalrm(broker, timeout_secs, job_id): def _on_sigalrm(self, signum, frame):
""" """
Respond to SIGALRM (job timeout) by updating the job file and killing the Respond to SIGALRM (job timeout) by updating the job file and killing
process. the process.
""" """
msg = "Job reached maximum time limit of %d seconds." % (timeout_secs,) msg = "Job reached maximum time limit of %d seconds." % (
_write_job_status(job_id, { self.timeout_secs,
)
self._update({
"failed": 1, "failed": 1,
"finished": 1, "finished": 1,
"msg": msg, "msg": msg,
}) })
broker.shutdown() self._timed_out = True
self.econtext.broker.shutdown()
def _install_alarm(self):
signal.signal(signal.SIGALRM, self._on_sigalrm)
signal.alarm(self.timeout_secs)
def _run_module(self):
kwargs = dict(self.kwargs, **{
'detach': True,
'econtext': self.econtext,
'emulate_tty': False,
})
def _install_alarm(broker, timeout_secs, job_id): dct = run_module(kwargs)
handler = lambda *_: _sigalrm(broker, timeout_secs, job_id) if mitogen.core.PY3:
signal.signal(signal.SIGALRM, handler) for key in 'stdout', 'stderr':
signal.alarm(timeout_secs) dct[key] = dct[key].decode('utf-8', 'surrogateescape')
return dct
def _parse_result(self, dct):
filtered, warnings = (
ansible.module_utils.json_utils.
_filter_non_json_lines(dct['stdout'])
)
result = json.loads(filtered)
result.setdefault('warnings', []).extend(warnings)
result['stderr'] = dct['stderr']
self._update(result)
def _run_module_async(kwargs, job_id, timeout_secs, econtext): def _run(self):
""" """
1. Immediately updates the status file to mark the job as started. 1. Immediately updates the status file to mark the job as started.
2. Installs a timer/signal handler to implement the time limit. 2. Installs a timer/signal handler to implement the time limit.
@ -317,59 +349,54 @@ def _run_module_async(kwargs, job_id, timeout_secs, econtext):
:param int timeout_secs: :param int timeout_secs:
If >0, limit the task's maximum run time. If >0, limit the task's maximum run time.
""" """
_write_job_status(job_id, { self._update({
'started': 1, 'started': 1,
'finished': 0, 'finished': 0,
'pid': os.getpid() 'pid': os.getpid()
}) })
if timeout_secs > 0: if self.timeout_secs > 0:
_install_alarm(econtext.broker, timeout_secs, job_id) self._install_alarm()
kwargs['detach'] = True
kwargs['econtext'] = econtext
kwargs['emulate_tty'] = False
dct = run_module(kwargs)
if mitogen.core.PY3:
for key in 'stdout', 'stderr':
dct[key] = dct[key].decode('utf-8', 'surrogateescape')
dct = self._run_module()
if not self._timed_out:
# After SIGALRM fires, there is a window between broker responding
# to shutdown() by killing the process, and work continuing on the
# main thread. If main thread was asleep in at least
# basic.py/select.select(), an EINTR will be raised. We want to
# discard that exception.
try: try:
filtered, warnings = ( self._parse_result(dct)
ansible.module_utils.json_utils.
_filter_non_json_lines(dct['stdout'])
)
result = json.loads(filtered)
result.setdefault('warnings', []).extend(warnings)
result['stderr'] = dct['stderr']
_write_job_status(job_id, result)
except Exception: except Exception:
_write_job_status(job_id, { self._update({
"failed": 1, "failed": 1,
"msg": traceback.format_exc(), "msg": traceback.format_exc(),
"data": dct['stdout'], # temporary notice only "data": dct['stdout'], # temporary notice only
"stderr": dct['stderr'] "stderr": dct['stderr']
}) })
def run(self):
@mitogen.core.takes_econtext
def run_module_async(kwargs, job_id, timeout_secs, econtext):
"""
Arrange for a module to be executed with its run status and result
serialized to a disk file. This function expects to run in a child forked
using :func:`create_fork_child`.
"""
try: try:
try: try:
_run_module_async(kwargs, job_id, timeout_secs, econtext) self._run()
except Exception: except Exception:
# Catch any (ansible_mitogen) bugs and write them to the job file. self._update({
_write_job_status(job_id, {
"failed": 1, "failed": 1,
"msg": traceback.format_exc(), "msg": traceback.format_exc(),
}) })
finally: finally:
econtext.broker.shutdown() self.econtext.broker.shutdown()
@mitogen.core.takes_econtext
def run_module_async(kwargs, job_id, timeout_secs, econtext):
"""
Execute a module with its run status and result written to a file,
terminating on the process on completion. This function must run in a child
forked using :func:`create_fork_child`.
"""
arunner = AsyncRunner(job_id, timeout_secs, econtext, kwargs)
arunner.run()
def make_temp_directory(base_dir): def make_temp_directory(base_dir):

@ -48,6 +48,15 @@ LOG = logging.getLogger(__name__)
_last = None _last = None
def enable_evil_interrupts():
signal.signal(signal.SIGALRM, (lambda a, b: None))
signal.setitimer(signal.ITIMER_REAL, 0.01, 0.01)
def disable_evil_interrupts():
signal.setitimer(signal.ITIMER_REAL, 0, 0)
def _hex(n): def _hex(n):
return '%08x' % n return '%08x' % n

@ -271,9 +271,12 @@ class SerializedInvoker(Invoker):
method_name, kwargs, msg = tup method_name, kwargs, msg = tup
try: try:
super(SerializedInvoker, self).invoke(method_name, kwargs, msg) super(SerializedInvoker, self).invoke(method_name, kwargs, msg)
except mitogen.core.CallError:
e = sys.exc_info()[1]
LOG.warning('%r: call error: %s: %s', self, msg, e)
msg.reply(e)
except Exception: except Exception:
LOG.exception('%r: while invoking %r of %r', LOG.exception('%r: while invoking %s()', self, method_name)
self, method_name, self.service)
msg.reply(mitogen.core.Message.dead()) msg.reply(mitogen.core.Message.dead())
def invoke(self, method_name, kwargs, msg): def invoke(self, method_name, kwargs, msg):
@ -456,9 +459,13 @@ class Pool(object):
closed = False closed = False
def stop(self): def stop(self, join=True):
self.closed = True self.closed = True
self._select.close() self._select.close()
if join:
self.join()
def join(self):
for th in self._threads: for th in self._threads:
th.join() th.join()
for invoker in self._invoker_by_name.itervalues(): for invoker in self._invoker_by_name.itervalues():

@ -4,6 +4,7 @@ gathering = explicit
strategy_plugins = ../../ansible_mitogen/plugins/strategy strategy_plugins = ../../ansible_mitogen/plugins/strategy
action_plugins = lib/action action_plugins = lib/action
callback_plugins = lib/callback callback_plugins = lib/callback
stdout_callback = nice_stdout
library = lib/modules library = lib/modules
# module_utils = lib/module_utils # module_utils = lib/module_utils
retry_files_enabled = False retry_files_enabled = False

@ -6,16 +6,20 @@
# Start 2 duplicate jobs, verify they run concurrently. # Start 2 duplicate jobs, verify they run concurrently.
- file:
path: /tmp/flurp
state: absent
- name: create semaphore file and sleep for 5 seconds. - name: create semaphore file and sleep for 5 seconds.
shell: | shell: |
exec 2>/dev/null; exec 2>/dev/null;
bash -c ' bash -c '
echo im_alive $$ > /tmp/flurp echo im_alive $$ > /tmp/flurp;
sleep 60; sleep 60;
'; ';
rm -f /tmp/flurp; rm -f /tmp/flurp;
echo alldone echo alldone
async: 1000 async: 30
poll: 0 poll: 0
register: job1 register: job1
@ -24,30 +28,29 @@
# below compltes quickly. # below compltes quickly.
- name: verify semaphore file exists while this job exists. - name: verify semaphore file exists while this job exists.
shell: | shell: |
[ -f /tmp/flurp ] && { while [ ! -f /tmp/flurp ]; do sleep 0.1; done;
read im_alive pid < /tmp/flurp read im_alive pid < /tmp/flurp
echo $im_alive echo $im_alive
kill $pid &>/dev/null kill $pid &>/dev/null
} async: 30
async: 1000
poll: 0 poll: 0
register: job2 register: job2
- name: (job1) busy-poll up to 100000 times - name: (job1) poll
async_status: async_status:
jid: "{{job1.ansible_job_id}}" jid: "{{job1.ansible_job_id}}"
register: result1 register: result1
until: result1.finished until: result1.finished
retries: 100000 retries: 5
delay: 0 delay: 1
- name: (job2) busy-poll up to 100000 times - name: (job2) poll
async_status: async_status:
jid: "{{job2.ansible_job_id}}" jid: "{{job2.ansible_job_id}}"
register: result2 register: result2
until: result2.finished until: result2.finished
retries: 100000 retries: 5
delay: 0 delay: 1
- assert: - assert:
that: that:

@ -0,0 +1,54 @@
from __future__ import unicode_literals
import io
try:
from ansible.plugins import callback_loader
except ImportError:
from ansible.plugins.loader import callback_loader
def printi(tio, obj, key=None, indent=0):
def write(s, *args):
if args:
s %= args
tio.write(' ' * indent)
if key is not None:
tio.write('%s: ' % (key,))
tio.write(s)
tio.write('\n')
if isinstance(obj, (list, tuple)):
write('[')
for i, obj2 in enumerate(obj):
printi(tio, obj2, key=i, indent=indent+1)
key = None
write(']')
elif isinstance(obj, dict):
write('{')
for key2, obj2 in sorted(obj.iteritems()):
if not (key2.startswith('_ansible_') or
key2.endswith('_lines')):
printi(tio, obj2, key=key2, indent=indent+1)
key = None
write('}')
elif isinstance(obj, basestring):
if isinstance(obj, str):
obj = obj.decode('utf-8', 'replace')
for line in obj.splitlines():
write('%s', line.rstrip('\r\n'))
else:
write('%r', obj)
DefaultModule = callback_loader.get('default', class_only=True)
class CallbackModule(DefaultModule):
def _dump_results(self, result, *args, **kwargs):
try:
tio = io.StringIO()
printi(tio, result)
return tio.getvalue().encode('ascii', 'replace')
except:
import traceback
traceback.print_exc()
raise

@ -35,6 +35,7 @@
- has_sudo_pubkey - has_sudo_pubkey
- require_tty - require_tty
- pw_required - pw_required
- readonly_homedir
- require_tty_pw_required - require_tty_pw_required
- slow_user - slow_user
when: ansible_system != 'Darwin' when: ansible_system != 'Darwin'

Loading…
Cancel
Save