From a1e9b9e8dbf2ea7ac5d75b2830542c006fa4721b Mon Sep 17 00:00:00 2001
From: Alex Willmer
Date: Sat, 31 Mar 2018 11:31:51 +0100
Subject: [PATCH] Issue #160: Reimplement minimize_source as token filters

Benefits:
- More correct than re.sub()
- Better handling of trailing whitespace
- Recognises doc-strings regardless of quoting style

Limitations:
- Still not entirely correct
  - Creates a syntax error when function/class body is only a docstring
  - Doesn't handle indented docstrings yet
- Slower by 50x - 8-10 ms vs 0.2 ms for re.sub()
  - Not much scope for improving this, tokenize is 100% pure Python
- Complex state machine, harder to understand
- Higher line count in parent.py
- Untested with Mitogen parent on Python 2.x and child on Python 2.x+y

No change:
- Only requires Python stdlib modules
---
 mitogen/parent.py | 70 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 62 insertions(+), 8 deletions(-)

diff --git a/mitogen/parent.py b/mitogen/parent.py
index 8a9a186a..00b0c3aa 100644
--- a/mitogen/parent.py
+++ b/mitogen/parent.py
@@ -26,12 +26,12 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
+import cStringIO
 import fcntl
 import getpass
 import inspect
 import logging
 import os
-import re
 import select
 import signal
 import socket
@@ -39,6 +39,7 @@ import sys
 import termios
 import textwrap
 import threading
+import tokenize
 import time
 import types
 import zlib
@@ -48,9 +49,6 @@
 from mitogen.core import LOG
 from mitogen.core import IOLOG
 
-DOCSTRING_RE = re.compile(r'""".+?"""', re.M | re.S)
-COMMENT_RE = re.compile(r'^[ ]*#[^\n]*$', re.M)
-
 try:
     SC_OPEN_MAX = os.sysconf('SC_OPEN_MAX')
 except:
@@ -79,10 +77,66 @@ def get_log_level():
 
 
 def minimize_source(source):
-    subber = lambda match: '""' + ('\n' * match.group(0).count('\n'))
-    source = DOCSTRING_RE.sub(subber, source)
-    source = COMMENT_RE.sub('', source)
-    return source.replace('    ', '\t')
+    """Remove most comments and docstrings from Python source code.
+    """
+    tokens = tokenize.generate_tokens(cStringIO.StringIO(source).readline)
+    tokens = strip_comments(tokens)
+    tokens = strip_docstrings(tokens)
+    return tokenize.untokenize(tokens)
+
+
+def strip_comments(tokens):
+    """Drop comment tokens from a `tokenize` stream.
+
+    Comments on lines 1-2 are kept, to preserve hashbang and encoding.
+    Trailing whitespace is removed from all lines.
+    """
+    prev_typ = None
+    prev_end_col = 0
+    for typ, tok, (start_row, start_col), (end_row, end_col), line in tokens:
+        if typ in (tokenize.NL, tokenize.NEWLINE):
+            if prev_typ in (tokenize.NL, tokenize.NEWLINE):
+                start_col = 0
+            else:
+                start_col = prev_end_col
+            end_col = start_col + 1
+        elif typ == tokenize.COMMENT and start_row > 2:
+            continue
+        prev_typ = typ
+        prev_end_col = end_col
+        yield typ, tok, (start_row, start_col), (end_row, end_col), line
+
+
+def strip_docstrings(tokens):
+    """Replace docstring tokens with NL tokens in a `tokenize` stream.
+
+    Any STRING token not part of an expression is deemed a docstring.
+    Indented docstrings are not yet recognised.
+    """
+    stack = []
+    state = 'wait_string'
+    for t in tokens:
+        typ = t[0]
+        if state == 'wait_string':
+            if typ in (tokenize.NL, tokenize.COMMENT):
+                yield t
+            elif typ == tokenize.STRING:
+                stack.append(t)
+            elif typ == tokenize.NEWLINE:
+                stack.append(t)
+                start_line, end_line = stack[0][2][0], stack[-1][3][0] + 1
+                for i in range(start_line, end_line):
+                    yield tokenize.NL, '\n', (i, 0), (i, 1), '\n'
+                del stack[:]
+            else:
+                stack.append(t)
+                for t in stack: yield t
+                del stack[:]
+                state = 'wait_newline'
+        elif state == 'wait_newline':
+            if typ == tokenize.NEWLINE:
+                state = 'wait_string'
+            yield t
 
 
 def flags(names):
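
Note (not part of the patch): a minimal usage sketch of the reworked
function, assuming Python 2 (cStringIO and the 5-tuple tokenize interface)
and that the patch above has been applied to mitogen/parent.py. The SAMPLE
source text is invented for illustration; exact output is not shown, and
the limitations listed in the commit message still apply.

    import mitogen.parent

    # Sample module: lines 1-2 (hashbang, coding cookie) are kept by
    # strip_comments; the later comments and the module docstring are not.
    SAMPLE = (
        '#!/usr/bin/env python\n'
        '# -*- coding: utf-8 -*-\n'
        '# this comment should be stripped\n'
        '"""Module docstring, replaced with blank lines."""\n'
        'def add(a, b):\n'
        '    return a + b  # trailing comment, also stripped\n'
    )

    print mitogen.parent.minimize_source(SAMPLE)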