From 7ce9968ce1a00ed21392db50b6c5779a53ef1305 Mon Sep 17 00:00:00 2001 From: James Cammarata Date: Wed, 25 Apr 2018 15:00:15 -0500 Subject: [PATCH] Properly unlock the socket file lock in ansible-connection (#39223) Also use a lock file per host, rather than one global file lock. Commit 9c0275a879d introduced a bug where the lock file was only being unlocked by the child PID of the resulting fork done in ansible-connection. This causes delays when a large inventory causes a lot of contention on that global lock. This patch fixes the problem by ensuring the lock is released regardless of the fork condition, and also to use a lock file based on the remote address of the target host, removing the global lock bottleneck. Fixes #38892 --- bin/ansible-connection | 103 +++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/bin/ansible-connection b/bin/ansible-connection index ed6070b3f0d..95878318179 100755 --- a/bin/ansible-connection +++ b/bin/ansible-connection @@ -20,6 +20,8 @@ import traceback import errno import json +from contextlib import contextmanager + from ansible import constants as C from ansible.module_utils._text import to_bytes, to_native, to_text from ansible.module_utils.six import PY3 @@ -33,6 +35,21 @@ from ansible.utils.display import Display from ansible.utils.jsonrpc import JsonRpcServer +@contextmanager +def file_lock(lock_path): + """ + Uses contextmanager to create and release a file lock based on the + given path. This allows us to create locks using `with file_lock()` + to prevent deadlocks related to failure to unlock properly. + """ + + lock_fd = os.open(lock_path, os.O_RDWR | os.O_CREAT, 0o600) + fcntl.lockf(lock_fd, fcntl.LOCK_EX) + yield + fcntl.lockf(lock_fd, fcntl.LOCK_UN) + os.close(lock_fd) + + class ConnectionProcess(object): ''' The connection process wraps around a Connection object that manages @@ -209,60 +226,54 @@ def main(): tmp_path = unfrackpath(C.PERSISTENT_CONTROL_PATH_DIR) makedirs_safe(tmp_path) - lock_path = unfrackpath("%s/.ansible_pc_lock" % tmp_path) + lock_path = unfrackpath("%s/.ansible_pc_lock_%s" % (tmp_path, play_context.remote_addr)) socket_path = unfrackpath(cp % dict(directory=tmp_path)) - # if the socket file doesn't exist, spin up the daemon process - lock_fd = os.open(lock_path, os.O_RDWR | os.O_CREAT, 0o600) - fcntl.lockf(lock_fd, fcntl.LOCK_EX) - - if not os.path.exists(socket_path): - messages.append('local domain socket does not exist, starting it') - original_path = os.getcwd() - r, w = os.pipe() - pid = fork_process() + with file_lock(lock_path): + if not os.path.exists(socket_path): + messages.append('local domain socket does not exist, starting it') + original_path = os.getcwd() + r, w = os.pipe() + pid = fork_process() - if pid == 0: - try: - os.close(r) - wfd = os.fdopen(w, 'w') - process = ConnectionProcess(wfd, play_context, socket_path, original_path, ansible_playbook_pid) - process.start() - except Exception: - messages.append(traceback.format_exc()) - rc = 1 + if pid == 0: + try: + os.close(r) + wfd = os.fdopen(w, 'w') + process = ConnectionProcess(wfd, play_context, socket_path, original_path, ansible_playbook_pid) + process.start() + except Exception: + messages.append(traceback.format_exc()) + rc = 1 - fcntl.lockf(lock_fd, fcntl.LOCK_UN) - os.close(lock_fd) + if rc == 0: + process.run() - if rc == 0: - process.run() + sys.exit(rc) - sys.exit(rc) + else: + os.close(w) + rfd = os.fdopen(r, 'r') + data = json.loads(rfd.read()) + messages.extend(data.pop('messages')) + result.update(data) else: - os.close(w) - rfd = os.fdopen(r, 'r') - data = json.loads(rfd.read()) - messages.extend(data.pop('messages')) - result.update(data) - - else: - messages.append('found existing local domain socket, using it!') - conn = Connection(socket_path) - pc_data = to_text(init_data) - try: - messages.extend(conn.update_play_context(pc_data)) - except Exception as exc: - # Only network_cli has update_play context, so missing this is - # not fatal e.g. netconf - if isinstance(exc, ConnectionError) and getattr(exc, 'code', None) == -32601: - pass - else: - result.update({ - 'error': to_text(exc), - 'exception': traceback.format_exc() - }) + messages.append('found existing local domain socket, using it!') + conn = Connection(socket_path) + pc_data = to_text(init_data) + try: + messages.extend(conn.update_play_context(pc_data)) + except Exception as exc: + # Only network_cli has update_play context, so missing this is + # not fatal e.g. netconf + if isinstance(exc, ConnectionError) and getattr(exc, 'code', None) == -32601: + pass + else: + result.update({ + 'error': to_text(exc), + 'exception': traceback.format_exc() + }) messages.append(sys.stdout.getvalue()) result.update({