From a1e9b9e8dbf2ea7ac5d75b2830542c006fa4721b Mon Sep 17 00:00:00 2001
From: Alex Willmer
Date: Sat, 31 Mar 2018 11:31:51 +0100
Subject: [PATCH] Issue #160: Reimplement minimize_source as token filters

Benefits:
- More correct than re.sub()
- Better handling of trailing whitespace
- Recognises doc-strings regardless of quoting style

Limitations:
- Still not entirely correct
  - Creates a syntax error when function/class body is only a docstring
  - Doesn't handle indented docstrings yet
- Slower by 50x - 8-10 ms vs 0.2 ms for re.sub()
  - Not much scope for improving this, tokenize is 100% pure Python
- Complex state machine, harder to understand
- Higher line count in parent.py
- Untested with Mitogen parent on Python 2.x and child on Python 2.x+y

No change:
- Only requires Python stdlib modules
---
 mitogen/parent.py | 70 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 62 insertions(+), 8 deletions(-)

diff --git a/mitogen/parent.py b/mitogen/parent.py
index 8a9a186a..00b0c3aa 100644
--- a/mitogen/parent.py
+++ b/mitogen/parent.py
@@ -26,12 +26,12 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
+import cStringIO
 import fcntl
 import getpass
 import inspect
 import logging
 import os
-import re
 import select
 import signal
 import socket
@@ -39,6 +39,7 @@ import sys
 import termios
 import textwrap
 import threading
+import tokenize
 import time
 import types
 import zlib
@@ -48,9 +49,6 @@
 from mitogen.core import LOG
 from mitogen.core import IOLOG
 
-DOCSTRING_RE = re.compile(r'""".+?"""', re.M | re.S)
-COMMENT_RE = re.compile(r'^[ ]*#[^\n]*$', re.M)
-
 try:
     SC_OPEN_MAX = os.sysconf('SC_OPEN_MAX')
 except:
@@ -79,10 +77,66 @@ def get_log_level():
 
 
 def minimize_source(source):
-    subber = lambda match: '""' + ('\n' * match.group(0).count('\n'))
-    source = DOCSTRING_RE.sub(subber, source)
-    source = COMMENT_RE.sub('', source)
-    return source.replace('    ', '\t')
+    """Remove most comments and docstrings from Python source code.
+    """
+    tokens = tokenize.generate_tokens(cStringIO.StringIO(source).readline)
+    tokens = strip_comments(tokens)
+    tokens = strip_docstrings(tokens)
+    return tokenize.untokenize(tokens)
+
+
+def strip_comments(tokens):
+    """Drop comment tokens from a `tokenize` stream.
+
+    Comments on lines 1-2 are kept, to preserve hashbang and encoding.
+    Trailing whitespace is removed from all lines.
+    """
+    prev_typ = None
+    prev_end_col = 0
+    for typ, tok, (start_row, start_col), (end_row, end_col), line in tokens:
+        if typ in (tokenize.NL, tokenize.NEWLINE):
+            if prev_typ in (tokenize.NL, tokenize.NEWLINE):
+                start_col = 0
+            else:
+                start_col = prev_end_col
+            end_col = start_col + 1
+        elif typ == tokenize.COMMENT and start_row > 2:
+            continue
+        prev_typ = typ
+        prev_end_col = end_col
+        yield typ, tok, (start_row, start_col), (end_row, end_col), line
+
+
+def strip_docstrings(tokens):
+    """Replace docstring tokens with NL tokens in a `tokenize` stream.
+
+    Any STRING token not part of an expression is deemed a docstring.
+    Indented docstrings are not yet recognised.
+    """
+    stack = []
+    state = 'wait_string'
+    for t in tokens:
+        typ = t[0]
+        if state == 'wait_string':
+            if typ in (tokenize.NL, tokenize.COMMENT):
+                yield t
+            elif typ == tokenize.STRING:
+                stack.append(t)
+            elif typ == tokenize.NEWLINE:
+                stack.append(t)
+                start_line, end_line = stack[0][2][0], stack[-1][3][0] + 1
+                for i in range(start_line, end_line):
+                    yield tokenize.NL, '\n', (i, 0), (i, 1), '\n'
+                del stack[:]
+            else:
+                stack.append(t)
+                for t in stack: yield t
+                del stack[:]
+                state = 'wait_newline'
+        elif state == 'wait_newline':
+            if typ == tokenize.NEWLINE:
+                state = 'wait_string'
+            yield t
 
 
 def flags(names):
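
Note (not part of the patch): a minimal usage sketch of the reworked
function, assuming Python 2 (cStringIO and the 5-tuple tokenize interface)
and that the patch above has been applied to mitogen/parent.py. The SAMPLE
source text is invented for illustration; exact output is not shown, and
the limitations listed in the commit message still apply.

    import mitogen.parent

    # Sample module: lines 1-2 (hashbang, coding cookie) are kept by
    # strip_comments; the later comments and the module docstring are not.
    SAMPLE = (
        '#!/usr/bin/env python\n'
        '# -*- coding: utf-8 -*-\n'
        '# this comment should be stripped\n'
        '"""Module docstring, replaced with blank lines."""\n'
        'def add(a, b):\n'
        '    return a + b  # trailing comment, also stripped\n'
    )

    print mitogen.parent.minimize_source(SAMPLE)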