From 4531338b1251856f9c6ebe7c163492c127d5b71c Mon Sep 17 00:00:00 2001
From: David Wilson
Date: Thu, 31 Jan 2019 11:26:49 +0000
Subject: [PATCH] ansible: document and make affinity stuff portable to non-Linux

Portable in the sense that it does nothing, at least for now.
---
 ansible_mitogen/affinity.py | 164 ++++++++++++++++++++++++++++--------
 ansible_mitogen/process.py  |   4 +-
 ansible_mitogen/strategy.py |   6 +-
 docs/changelog.rst          |  11 +--
 4 files changed, 140 insertions(+), 45 deletions(-)

diff --git a/ansible_mitogen/affinity.py b/ansible_mitogen/affinity.py
index b6b10e02..b97aa38a 100644
--- a/ansible_mitogen/affinity.py
+++ b/ansible_mitogen/affinity.py
@@ -26,6 +26,53 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
+"""
+As Mitogen separates asynchronous IO out to a broker thread, communication
+necessarily involves context switching and waking that thread. When application
+threads and the broker share a CPU, this can be almost invisibly fast - around
+25 microseconds for a full A->B->A round-trip.
+
+However, when threads are scheduled on different CPUs, round-trip delays
+regularly vary wildly, easily stretching into milliseconds. Many contributing
+factors exist, not least scenarios like:
+
+1. A is preempted immediately after waking B, but before releasing the GIL.
+2. B wakes from IO wait only to immediately enter futex wait.
+3. A may wait 10ms or more for another timeslice, as the scheduler on its CPU
+   runs threads unrelated to its transaction (i.e. not B), then wake only to
+   release its GIL before entering IO sleep waiting for a reply from B, which
+   cannot exist yet.
+4. B wakes, acquires the GIL, performs work, and sends a reply to A, causing
+   it to wake. B is preempted before releasing the GIL.
+5. A wakes from IO wait only to immediately enter futex wait.
+6. B may wait 10ms or more for another timeslice, then wake only to release
+   its GIL before sleeping again.
+7. A wakes, acquires the GIL, and finally receives the reply.
+
+Per the above, if we are unlucky, even on a moderately busy machine it is
+possible to lose milliseconds to scheduling delay alone. The effect is
+compounded when pairs of threads in process A communicate with pairs of
+threads in process B using the same scheme, such as when an Ansible
+WorkerProcess talks to ContextService in the connection multiplexer. In the
+worst case, 4 threads work in lockstep spread across 4 busy CPUs.
+
+Since multithreading in Python is essentially useless except for waiting on IO
+due to the presence of the GIL, at least in Ansible there is no good reason for
+threads in the same process to run on distinct CPUs - they always operate in
+lockstep due to the GIL, and are thus vulnerable to issues like the above.
+
+Linux lacks any natural API to describe what we want; it only permits
+individual threads to be constrained to run on specific CPUs, and for that
+constraint to be inherited by new threads and forks of the constrained thread.
+
+This module therefore implements a CPU pinning policy for Ansible processes,
+providing methods that should be called early in any new process, either to
+rebalance which CPU it is pinned to, or in the case of subprocesses, to remove
+the pinning entirely. It is likely to require ongoing tweaking, since pinning
+necessarily involves preventing the scheduler from making load balancing
+decisions.
+""" + import ctypes import mmap import multiprocessing @@ -45,9 +92,17 @@ try: _sched_setaffinity = _libc.sched_setaffinity except (OSError, AttributeError): _libc = None + _strerror = None + _pthread_mutex_init = None + _pthread_mutex_lock = None + _pthread_mutex_unlock = None + _sched_setaffinity = None class pthread_mutex_t(ctypes.Structure): + """ + Wrap pthread_mutex_t to allow storing a lock in shared memory. + """ _fields_ = [ ('data', ctypes.c_uint8 * 512), ] @@ -66,32 +121,60 @@ class pthread_mutex_t(ctypes.Structure): class State(ctypes.Structure): + """ + Contents of shared memory segment. This allows :meth:`Manager.assign` to be + called from any child, since affinity assignment must happen from within + the context of the new child process. + """ _fields_ = [ ('lock', pthread_mutex_t), ('counter', ctypes.c_uint8), ] -class Manager(object): +class Policy(object): + """ + Process affinity policy. + """ + def assign_controller(self): + """ + Assign the Ansible top-level policy to this process. + """ + + def assign_muxprocess(self): + """ + Assign the MuxProcess policy to this process. + """ + + def assign_worker(self): + """ + Assign the WorkerProcess policy to this process. + """ + + def assign_subprocess(self): + """ + Assign the helper subprocess policy to this process. + """ + + +class LinuxPolicy(Policy): """ - Bind this process to a randomly selected CPU. If done prior to starting - threads, all threads will be bound to the same CPU. This call is a no-op on - systems other than Linux. - - A hook is installed that causes `reset_affinity(clear=True)` to run in the - child of any process created with :func:`mitogen.parent.detach_popen`, - ensuring CPU-intensive children like SSH are not forced to share the same - core as the (otherwise potentially very busy) parent. - - Threads bound to the same CPU share cache and experience the lowest - possible inter-thread roundtrip latency, for example ensuring the minimum - possible time required for :class:`mitogen.service.Pool` to interact with - :class:`mitogen.core.Broker`, as required for every message transmitted or - received. - - Binding threads of a Python process to one CPU makes sense, as they are - otherwise unable to operate in parallel, and all must acquire the same lock - prior to executing. + :class:`Policy` for Linux machines. The scheme here was tested on an + otherwise idle 16 thread machine. + + - The connection multiplexer is pinned to CPU 0. + - The Ansible top-level (strategy) is pinned to CPU 1. + - WorkerProcesses are pinned sequentually to 2..N, wrapping around when no + more CPUs exist. + - Children such as SSH may be scheduled on any CPU except 0/1. + + This could at least be improved by having workers pinned to independent + cores, before reusing the second hyperthread of an existing core. + + A hook is installed that causes :meth:`reset` to run in the child of any + process created with :func:`mitogen.parent.detach_popen`, ensuring + CPU-intensive children like SSH are not forced to share the same core as + the (otherwise potentially very busy) parent. 
""" def __init__(self): self.mem = mmap.mmap(-1, 4096) @@ -99,26 +182,14 @@ class Manager(object): self.state.lock.init() def _set_affinity(self, mask): - mitogen.parent._preexec_hook = self.clear + mitogen.parent._preexec_hook = self._clear s = struct.pack('L', mask) _sched_setaffinity(os.getpid(), len(s), s) - def cpu_count(self): + def _cpu_count(self): return multiprocessing.cpu_count() - def clear(self): - """ - Clear any prior binding, except for reserved CPUs. - """ - self._set_affinity(0xffffffff & ~3) - - def set_cpu(self, cpu): - """ - Bind to 0-based `cpu`. - """ - self._set_affinity(1 << cpu) - - def assign(self): + def _balance(self): self.state.lock.acquire() try: n = self.state.counter @@ -126,7 +197,28 @@ class Manager(object): finally: self.state.lock.release() - self.set_cpu(2 + (n % max(1, (self.cpu_count() - 2)))) + self._set_cpu(2 + (n % max(1, (self._cpu_count() - 2)))) + + def _set_cpu(self, cpu): + self._set_affinity(1 << cpu) + + def _clear(self): + self._set_affinity(0xffffffff & ~3) + + def assign_controller(self): + self._set_cpu(1) + + def assign_muxprocess(self): + self._set_cpu(0) + + def assign_worker(self): + self._balance() + + def assign_subprocess(self): + self._clear() -manager = Manager() +if _sched_setaffinity is not None: + policy = LinuxPolicy() +else: + policy = Policy() diff --git a/ansible_mitogen/process.py b/ansible_mitogen/process.py index 2049d9b6..902829f7 100644 --- a/ansible_mitogen/process.py +++ b/ansible_mitogen/process.py @@ -173,12 +173,12 @@ class MuxProcess(object): if _init_logging: ansible_mitogen.logging.setup() if cls.child_pid: - ansible_mitogen.affinity.manager.set_cpu(1) + ansible_mitogen.affinity.policy.assign_controller() cls.child_sock.close() cls.child_sock = None mitogen.core.io_op(cls.worker_sock.recv, 1) else: - ansible_mitogen.affinity.manager.set_cpu(0) + ansible_mitogen.affinity.policy.assign_muxprocess() cls.worker_sock.close() cls.worker_sock = None self = cls() diff --git a/ansible_mitogen/strategy.py b/ansible_mitogen/strategy.py index 821d63d2..4d1636e2 100644 --- a/ansible_mitogen/strategy.py +++ b/ansible_mitogen/strategy.py @@ -28,6 +28,7 @@ from __future__ import absolute_import import os +import signal import threading import mitogen.core @@ -102,11 +103,12 @@ def wrap_worker__run(*args, **kwargs): While the strategy is active, rewrite connection_loader.get() calls for some transports into requests for a compatible Mitogen transport. """ + # Ignore parent's attempts to murder us when we still need to write + # profiling output. if mitogen.core._profile_hook.__name__ != '_profile_hook': - import signal signal.signal(signal.SIGTERM, signal.SIG_IGN) - ansible_mitogen.affinity.manager.assign() + ansible_mitogen.affinity.policy.assign_worker() return mitogen.core._profile_hook('WorkerProcess', lambda: worker__run(*args, **kwargs) ) diff --git a/docs/changelog.rst b/docs/changelog.rst index 424585cf..890477e0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -132,7 +132,8 @@ Mitogen for Ansible ~~~~~~~~~~~~~~~~~~~ This release includes a huge variety of important fixes and new optimizations. -On a synthetic run with 64 hosts it is over 35% faster than v0.2.3. +It is 35% faster than 0.2.3 on a synthetic 64 target run that places heavy load +on the connection multiplexer. Enhancements ^^^^^^^^^^^^ @@ -163,10 +164,10 @@ Enhancements plug-in path. See :ref:`mitogen-get-stack` for more information. 
 * `152effc2 `_,
-  `bd4b04ae `_: multiplexer
-  threads are pinned to one CPU, reducing latency and SMP overhead on a hot
-  path exercised for every task. This yielded a 19% speedup in a 64-target job
-  composed of many short tasks, and should easily be visible as a runtime
+  `bd4b04ae `_: a CPU affinity
+  policy was added for Linux controllers, reducing latency and SMP overhead on
+  hot paths exercised for every task. This yielded a 19% speedup in a 64-target
+  job composed of many short tasks, and should easily be visible as a runtime
   improvement in many-host runs.
 
 * `0979422a `_: an expensive
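
The sketch below is not part of the patch; it merely restates, using only the
Python 3 standard library, the pinning scheme that the new affinity.py
docstring describes, for anyone who wants to experiment with the idea outside
Ansible. The patch itself reaches sched_setaffinity(2) through ctypes and
coordinates workers via a pthread mutex and counter in shared memory so that
it also works on Python 2 and across fork(); here, os.sched_setaffinity
(Linux-only, Python 3.3+) and a plain in-process counter stand in for both,
and the function names simply mirror the new Policy methods::

    # Illustrative sketch only, not the patch's implementation. Assumes Linux
    # and Python 3.3+, where os.sched_setaffinity() exposes the same syscall
    # the patch reaches via ctypes. The itertools counter replaces the patch's
    # shared-memory counter, so this version only balances workers spawned by
    # a single coordinating process.
    import itertools
    import multiprocessing
    import os

    _worker_counter = itertools.count()

    def assign_muxprocess():
        # Connection multiplexer: pinned to CPU 0.
        os.sched_setaffinity(0, {0})

    def assign_controller():
        # Ansible top-level (strategy): pinned to CPU 1.
        os.sched_setaffinity(0, {1})

    def assign_worker():
        # WorkerProcesses: pinned sequentially to 2..N, wrapping around when
        # no more CPUs exist - the same arithmetic as LinuxPolicy._balance().
        n = next(_worker_counter)
        cpu = 2 + (n % max(1, multiprocessing.cpu_count() - 2))
        os.sched_setaffinity(0, {cpu})

    def assign_subprocess():
        # Children such as SSH: any CPU except the reserved 0 and 1. On a
        # machine with two or fewer CPUs, fall back to the reserved ones
        # rather than passing an empty set.
        cpus = set(range(2, multiprocessing.cpu_count()))
        os.sched_setaffinity(0, cpus or {0, 1})

Each function would be called once, early in the corresponding process,
mirroring how the patch wires assign_muxprocess() and assign_controller() into
MuxProcess and assign_worker() into wrap_worker__run(). On platforms without
sched_setaffinity, the patch instead falls back to the no-op base Policy,
which is what makes the feature portable to non-Linux in the sense the commit
message describes.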