diff --git a/minify.py b/minify.py new file mode 100644 index 0000000..06d9b9c --- /dev/null +++ b/minify.py @@ -0,0 +1,683 @@ +## {{{ http://code.activestate.com/recipes/576704/ (r16) +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# pyminifier.py +# +# Copyright 2009 Dan McDougall +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; Version 3 of the License +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, the license can be downloaded here: +# +# http://www.gnu.org/licenses/gpl.html + +# Meta +__version__ = '1.4.1' +__license__ = "GNU General Public License (GPL) Version 3" +__version_info__ = (1, 4, 1) +__author__ = 'Dan McDougall ' + +""" +**Python Minifier:** Reduces the size of (minifies) Python code for use on +embedded platforms. + +Performs the following: + - Removes docstrings. + - Removes comments. + - Minimizes code indentation. + - Joins multiline pairs of parentheses, braces, and brackets (and removes extraneous whitespace within). + - Preserves shebangs and encoding info (e.g. "# -- coding: utf-8 --"). + +Various examples and edge cases are sprinkled throughout the pyminifier code so +that it can be tested by minifying itself. The way to test is thus: + +.. 
code-block:: bash + + $ python pyminifier.py pyminifier.py > minified_pyminifier.py + $ python minified_pyminifier.py pyminifier.py > this_should_be_identical.py + $ diff minified_pyminifier.py this_should_be_identical.py + $ + +If you get an error executing minified_pyminifier.py or +'this_should_be_identical.py' isn't identical to minified_pyminifier.py then +something is broken. +""" + +import sys, re, cStringIO, tokenize +from optparse import OptionParser + +# Compile our regular expressions for speed +multiline_quoted_string = re.compile(r'(\'\'\'|\"\"\")') +not_quoted_string = re.compile(r'(\".*\'\'\'.*\"|\'.*\"\"\".*\')') +trailing_newlines = re.compile(r'\n\n') +shebang = re.compile('^#\!.*$') +encoding = re.compile(".*coding[:=]\s*([-\w.]+)") +multiline_indicator = re.compile('\\\\(\s*#.*)?\n') +# The above also removes trailing comments: "test = 'blah \ # comment here" + +# These aren't used but they're a pretty good reference: +double_quoted_string = re.compile(r'((? last_lineno: + last_col = 0 + if start_col > last_col: + out += (" " * (start_col - last_col)) + # Remove comments: + if token_type == tokenize.COMMENT: + pass + # This series of conditionals removes docstrings: + elif token_type == tokenize.STRING: + if prev_toktype != tokenize.INDENT: + # This is likely a docstring; double-check we're not inside an operator: + if prev_toktype != tokenize.NEWLINE: + # Note regarding NEWLINE vs NL: The tokenize module + # differentiates between newlines that start a new statement + # and newlines inside of operators such as parens, brackes, + # and curly braces. Newlines inside of operators are + # NEWLINE and newlines that start new code are NL. + # Catch whole-module docstrings: + if start_col > 0: + # Unlabelled indentation means we're inside an operator + out += token_string + # Note regarding the INDENT token: The tokenize module does + # not label indentation inside of an operator (parens, + # brackets, and curly braces) as actual indentation. 
def reduce_operators(source):
    """
    Remove the spaces around operators in 'source' and return the result.

    'source' is tokenized and re-assembled one physical line at a time.  For
    every operator token that does not open a statement, the single space
    directly before it and the single space directly after it (when present)
    are dropped.  An operator that immediately follows a NEWLINE, INDENT or
    DEDENT token (a decorator's "@", a leading open paren, ...) is left alone
    so the indentation of its line is preserved.

    Example:

    .. code-block:: python

        def foo(foo, bar, blah):
            test = "This is a %s" % foo

    Will become:

    .. code-block:: python

        def foo(foo,bar,blah):
            test="This is a %s"%foo

    Any exception raised by tokenize.generate_tokens() for input it cannot
    tokenize propagates to the caller.
    """
    # cStringIO only exists on Python 2; io.StringIO is the fallback so the
    # function also works where cStringIO cannot be imported.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO

    # An operator right after one of these token types starts a statement.
    start_of_line_types = (tokenize.NEWLINE, tokenize.DEDENT, tokenize.INDENT)

    finished_lines = []
    out_line = ""        # Text collected since the last newline-terminated token
    remove_columns = []  # Source columns (1 too high) of candidate spaces
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(StringIO(source).readline):
        token_type, token_string = tok[0], tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            # tokenize yields no whitespace tokens; rebuild the gap by hand
            out_line += " " * (start_col - last_col)
        if (token_type == tokenize.OP
                and prev_toktype not in start_of_line_types):
            remove_columns.append(start_col)    # Before OP
            remove_columns.append(end_col + 1)  # After OP
        out_line += token_string
        if token_string.endswith('\n'):
            # NOTE(review): the columns are relative to one physical line but
            # a token spanning several lines (e.g. a triple-quoted string) is
            # buffered whole, so offsets recorded after such a token may not
            # line up with out_line -- confirm before relying on that case.
            lshift = 1  # Grows by one for every space already removed
            for col in remove_columns:
                idx = col - lshift
                if 0 <= idx < len(out_line) and out_line[idx] == " ":
                    out_line = out_line[:idx] + out_line[idx + 1:]
                    lshift += 1
            finished_lines.append(out_line)
            out_line = ""
            remove_columns = []
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    # Capture the last line if it doesn't end in a newline:
    finished_lines.append(out_line)
    return "".join(finished_lines)
character == '"' and escaped: + output += character + escaped = False + elif character == "'" and escaped: + output += character + escaped = False + elif character == '"' and inside_quotes: + if inside_single_quotes: + output += character + else: + inside_quotes = False + inside_double_quotes = False + output += character + elif character == "'" and inside_quotes: + if inside_double_quotes: + output += character + else: + inside_quotes = False + inside_single_quotes = False + output += character + elif character == '"' and not inside_quotes: + inside_quotes = True + inside_double_quotes = True + output += character + elif character == "'" and not inside_quotes: + inside_quotes = True + inside_single_quotes = True + output += character + elif character == ' ' and inside_pair and not inside_quotes: + if not output[-1] in [' ', opener]: + output += ' ' + else: + if escaped: + escaped = False + output += character + if inside_pair == False: + output += '\n' + else: + output += line + '\n' + else: + output += line + '\n' + + # Clean up + output = trailing_newlines.sub('\n', output) + + return output + +def dedent(source): + """ + Minimizes indentation to save precious bytes + + Example: + + .. code-block:: python + + def foo(bar): + test = "This is a test" + + Will become: + + .. 
def fix_empty_methods(source):
    """
    Appends 'pass' to empty methods/functions (i.e. where there was nothing
    but a docstring before it was removed) and returns the result.

    A header counts as empty when the next line of code is indented no
    deeper than the header itself, or when no code follows it at all.  Blank
    lines and comment-only lines are passed over while looking for that next
    line.  Headers that carry their body on the same line
    (e.g. "def foo(): return 1") are left untouched.

    Example:

    .. code-block:: python

        # Note: This triple-single-quote inside a triple-double-quote is also a
        # pyminifier self-test
        def myfunc():
            '''This is just a placeholder function.'''

    Will become:

    .. code-block:: python

        def myfunc(): pass

    Every line of the result (including the last) is terminated with a
    newline and whitespace-only lines are emitted as empty lines.  The scan
    is purely line based: a header split over several lines is not
    recognized and text inside multi-line strings is not told apart from
    code.
    """
    # A def header with nothing after the colon that closes the signature.
    header = re.compile(r'^\s*(?:async\s+)?def\s.*\(.*\)\s*(?:->.*)?:\s*$')
    lines = []
    pending = None      # Index in 'lines' of a header still waiting for a body
    pending_indent = 0  # Leading whitespace width of that header
    for line in source.split('\n'):
        stripped = line.strip()
        if not stripped:
            lines.append("")
            continue
        if stripped.startswith('#'):
            # A comment is not a body; keep waiting for real code
            lines.append(line)
            continue
        indent = len(line) - len(line.lstrip())
        if pending is not None and indent <= pending_indent:
            # Nothing was indented under the header: the body is missing
            lines[pending] = lines[pending].rstrip() + " pass"
        pending = None
        if header.match(line):
            pending = len(lines)
            pending_indent = indent
        lines.append(line)
    if pending is not None:
        # The header was the last line of code in 'source'
        lines[pending] = lines[pending].rstrip() + " pass"
    return "".join(item + "\n" for item in lines)
def _self_extracting(module_name, compress, source):
    """
    Build the loader script shared by bz2_pack() and gz_pack().

    'module_name' is the module the generated script imports and calls
    decompress() on, 'compress' is the matching compression function and
    'source' is the code to embed.  Returns the generated script as a string.
    """
    import base64
    if not isinstance(source, bytes):
        # The compressors only accept bytes; text is embedded as UTF-8
        source = source.encode('utf-8')
    payload = base64.b64encode(compress(source))
    if not isinstance(payload, str):
        # b64encode() returned bytes rather than str; base64 is plain ASCII
        payload = payload.decode('ascii')
    # exec(...) with parentheses is valid both as the Python 2 statement and
    # as the Python 3 function call.
    return (
        "import %s, base64\n"
        "exec(%s.decompress(base64.b64decode('%s')))\n"
        % (module_name, module_name, payload)
    )

def bz2_pack(source):
    "Returns 'source' as a bzip2-compressed, self-extracting python script."
    import bz2
    return _self_extracting('bz2', bz2.compress, source)

def gz_pack(source):
    "Returns 'source' as a zlib-compressed, self-extracting python script."
    import zlib
    return _self_extracting('zlib', zlib.compress, source)
def main():
    """
    Command line entry point.

    Parses the options, reads the script named by the first positional
    argument, minifies it with minify(), optionally wraps the result with
    bz2_pack() (--bzip2) or gz_pack() (--gzip; --bzip2 wins when both are
    given) and finally writes it to the --outfile path or prints it to
    stdout.

    Exits with status 2, after printing a message and the help text, when no
    input file is given or the input file cannot be read.
    """
    usage = '%prog [options] "<input file>"'
    parser = OptionParser(usage=usage, version=__version__)
    # Stop option parsing at the first positional argument
    parser.disable_interspersed_args()
    parser.add_option(
        "-o", "--outfile",
        dest="outfile",
        default=None,
        help="Save output to the given file.",
        metavar="<file path>"
    )
    parser.add_option(
        "--bzip2",
        action="store_true",
        dest="bzip2",
        default=False,
        help="bzip2-compress the result into a self-executing python script."
    )
    parser.add_option(
        "--gzip",
        action="store_true",
        dest="gzip",
        default=False,
        help="gzip-compress the result into a self-executing python script."
    )
    options, args = parser.parse_args()
    if not args:
        print("No input file given.")
        parser.print_help()
        sys.exit(2)
    try:
        with open(args[0]) as infile:
            source = infile.read()
    except (IOError, OSError) as e:
        print(e)
        parser.print_help()
        sys.exit(2)
    # Minify our input script
    result = minify(source)
    # Compress it if we were asked to do so
    if options.bzip2:
        result = bz2_pack(result)
    elif options.gzip:
        result = gz_pack(result)
    # Either save the result to the output file or print it to stdout
    if options.outfile:
        with open(options.outfile, 'w') as outfile:
            outfile.write(result)
    else:
        # Parenthesized so the line is valid as a Python 2 print statement
        # and as a Python 3 function call
        print(result)

if __name__ == "__main__":
    main()
## end of http://code.activestate.com/recipes/576704/ }}}