Issue #160: Reimplement minimize_source as token filters

Benefits:

- More correct than re.sub()
- Better handling of trailing whitespace
- Recognises docstrings regardless of quoting style (see the sketch after this list)
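
Not part of the commit, but for illustration: a minimal sketch of the quoting-style
gap in the regex approach, using the DOCSTRING_RE pattern that the diff below removes.

    import re

    # Old pattern (removed in this diff): it only matches """-quoted strings.
    DOCSTRING_RE = re.compile(r'""".+?"""', re.M | re.S)

    src = "def f():\n    '''Docstring in single quotes.'''\n    return 1\n"

    # The regex never matches the '''-quoted docstring, so nothing is stripped.
    # The tokenize-based filters see a STRING token regardless of quoting style.
    print(DOCSTRING_RE.sub('""', src) == src)   # True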

Limitations:

- Still not entirely correct
  - Creates a syntax error when function/class body is only a docstring
  - Doesn't handle indented docstrings yet
- Slower by 50x: 8-10 ms vs 0.2 ms for re.sub() (rough benchmark sketch after this list)
  - Not much scope for improving this; tokenize is 100% pure Python
- Complex state machine, harder to understand
- Higher line count in parent.py
- Untested with Mitogen parent on Python 2.x and child on Python 2.x+y
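
Not part of the commit, but a rough sketch of how the timing above could be
reproduced; it assumes the new code is importable as mitogen.parent, and the
input file and iteration count are arbitrary. The old implementation is
re-created from the lines this diff removes.

    import re
    import timeit

    import mitogen.parent

    # Old implementation, reconstructed from the removed lines below.
    DOCSTRING_RE = re.compile(r'""".+?"""', re.M | re.S)
    COMMENT_RE = re.compile(r'^[ ]*#[^\n]*$', re.M)

    def minimize_source_re(source):
        subber = lambda match: '""' + ('\n' * match.group(0).count('\n'))
        source = DOCSTRING_RE.sub(subber, source)
        source = COMMENT_RE.sub('', source)
        return source.replace('    ', '\t')

    source = open('mitogen/core.py').read()   # any reasonably large module

    print(timeit.timeit(lambda: minimize_source_re(source), number=100))
    print(timeit.timeit(lambda: mitogen.parent.minimize_source(source), number=100))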

No change:

- Only requires Python stdlib modules
Alex Willmer committed 8 years ago (commit a1e9b9e8db, parent 35ae4e4227)

@@ -26,12 +26,12 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
+import cStringIO
 import fcntl
 import getpass
 import inspect
 import logging
 import os
-import re
 import select
 import signal
 import socket
@@ -39,6 +39,7 @@ import sys
 import termios
 import textwrap
 import threading
+import tokenize
 import time
 import types
 import zlib
@@ -48,9 +49,6 @@ from mitogen.core import LOG
 from mitogen.core import IOLOG
 
 
-DOCSTRING_RE = re.compile(r'""".+?"""', re.M | re.S)
-COMMENT_RE = re.compile(r'^[ ]*#[^\n]*$', re.M)
-
 try:
     SC_OPEN_MAX = os.sysconf('SC_OPEN_MAX')
 except:
@@ -79,10 +77,66 @@ def get_log_level():
 
 
 def minimize_source(source):
-    subber = lambda match: '""' + ('\n' * match.group(0).count('\n'))
-    source = DOCSTRING_RE.sub(subber, source)
-    source = COMMENT_RE.sub('', source)
-    return source.replace('    ', '\t')
+    """Remove most comments and docstrings from Python source code.
+    """
+    tokens = tokenize.generate_tokens(cStringIO.StringIO(source).readline)
+    tokens = strip_comments(tokens)
+    tokens = strip_docstrings(tokens)
+    return tokenize.untokenize(tokens)
+
+
+def strip_comments(tokens):
+    """Drop comment tokens from a `tokenize` stream.
+
+    Comments on lines 1-2 are kept, to preserve hashbang and encoding.
+    Trailing whitespace is removed from all lines.
+    """
+    prev_typ = None
+    prev_end_col = 0
+    for typ, tok, (start_row, start_col), (end_row, end_col), line in tokens:
+        if typ in (tokenize.NL, tokenize.NEWLINE):
+            if prev_typ in (tokenize.NL, tokenize.NEWLINE):
+                start_col = 0
+            else:
+                start_col = prev_end_col
+            end_col = start_col + 1
+        elif typ == tokenize.COMMENT and start_row > 2:
+            continue
+        prev_typ = typ
+        prev_end_col = end_col
+        yield typ, tok, (start_row, start_col), (end_row, end_col), line
+
+
+def strip_docstrings(tokens):
+    """Replace docstring tokens with NL tokens in a `tokenize` stream.
+
+    Any STRING token not part of an expression is deemed a docstring.
+    Indented docstrings are not yet recognised.
+    """
+    stack = []
+    state = 'wait_string'
+    for t in tokens:
+        typ = t[0]
+        if state == 'wait_string':
+            if typ in (tokenize.NL, tokenize.COMMENT):
+                yield t
+            elif typ == tokenize.STRING:
+                stack.append(t)
+            elif typ == tokenize.NEWLINE:
+                stack.append(t)
+                start_line, end_line = stack[0][2][0], stack[-1][3][0]+1
+                for i in range(start_line, end_line):
+                    yield tokenize.NL, '\n', (i, 0), (i,1), '\n'
+                del stack[:]
+            else:
+                stack.append(t)
+                for t in stack: yield t
+                del stack[:]
+                state = 'wait_newline'
+        elif state == 'wait_newline':
+            if typ == tokenize.NEWLINE:
+                state = 'wait_string'
+            yield t
+
+
 def flags(names):
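
Not part of the commit, but a hedged usage sketch of the new filters (Python 2,
since the code uses cStringIO); it assumes the module is importable as
mitogen.parent. Comments on line 1 survive, comments after line 2 are dropped,
and the module docstring should come back as a blank line.

    from mitogen.parent import minimize_source

    SOURCE = (
        '#!/usr/bin/env python\n'
        '"""Module docstring."""\n'
        'import os  # comment past line 2, dropped along with trailing spaces\n'
        '\n'
        'x = 1\n'
    )

    # Expected, roughly: hashbang kept, docstring line blanked, comment removed.
    print(minimize_source(SOURCE))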
