#!/usr/bin/env python
"""usage: %prog [options] filename

Parse a document to a tree, with optional profiling
"""

import sys
import os
import traceback
from optparse import OptionParser

from html5lib import html5parser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
from html5lib import utils


def _openInput(args):
    """Open the input named by the last positional argument.

    The argument may be an ``http://``/``https://`` URL, ``-`` for standard
    input, or a file system path.

    Returns a ``(stream, encoding)`` pair.  ``encoding`` is the charset taken
    from the HTTP ``Content-Type`` header for URLs (``None`` when the header
    names no charset), ``None`` for text-mode stdin on Python 3, and
    ``"utf8"`` otherwise.

    Writes a message to stderr and exits with status 1 when no argument was
    given or the input cannot be opened.
    """
    encoding = "utf8"

    if not args:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)
    name = args[-1]

    if name.startswith(('http://', 'https://')):
        try:
            import urllib.request
            stream = urllib.request.urlopen(name)
        except Exception as e:
            # Report the failure instead of silently falling through and
            # parsing the URL string itself as if it were the document.
            sys.stderr.write("Unable to open URL: %s\n" % e)
            sys.exit(1)
        # Only override the default when a Content-Type header is present;
        # get_content_charset() replaces the removed cgi.parse_header().
        if stream.headers.get('content-type'):
            encoding = stream.headers.get_content_charset()
        return stream, encoding

    if name == '-':
        if sys.version_info[0] >= 3:
            # sys.stdin yields already-decoded text on Python 3.
            encoding = None
        return sys.stdin, encoding

    try:
        return open(name, "rb"), encoding
    except IOError as e:
        sys.stderr.write("Unable to open file: %s\n" % e)
        sys.exit(1)


def parse():
    """Command-line entry point: parse the input and act on the options.

    Depending on the options the parse is profiled (statistics are printed
    and the parsed document is discarded), timed (timings go to stderr), or
    simply run; in the latter two cases a successfully parsed document is
    handed to printOutput().
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()

    f, encoding = _openInput(args)
    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

        if opts.sanitize:
            tokenizer = sanitizer.HTMLSanitizer
        else:
            tokenizer = HTMLTokenizer

        p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer,
                                   debug=opts.log)

        if opts.fragment:
            parseMethod = p.parseFragment
        else:
            parseMethod = p.parse

        if opts.profile:
            import cProfile
            import pstats
            # Profile in memory so no stats file is left in the working
            # directory.
            profiler = cProfile.Profile()
            profiler.runcall(run, parseMethod, f, encoding)
            stats = pstats.Stats(profiler)
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        elif opts.time:
            import time
            t0 = time.time()
            document = run(parseMethod, f, encoding)
            t1 = time.time()
            # Compare against None: an etree element without children is
            # falsy even though it is a valid parse result.
            if document is not None:
                printOutput(p, document, opts)
                t2 = time.time()
                sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
            else:
                sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
        else:
            document = run(parseMethod, f, encoding)
            if document is not None:
                printOutput(p, document, opts)
    finally:
        # Close whatever we opened; stdin is not ours to close.
        if f is not sys.stdin:
            f.close()


def run(parseMethod, f, encoding):
    """Call ``parseMethod(f, encoding=encoding)`` and return its result.

    If the parse raises, the traceback is printed and ``None`` is returned.
    KeyboardInterrupt and SystemExit are deliberately not intercepted.
    """
    try:
        document = parseMethod(f, encoding=encoding)
    except Exception:
        document = None
        traceback.print_exc()
    return document


def _writeStdout(data):
    """Write ``data`` to stdout, decoding bytes first on Python 3.

    The XML serializers used by printOutput() can return bytes, which the
    text-mode ``sys.stdout`` of Python 3 refuses to write.
    """
    if str is not bytes and isinstance(data, bytes):
        data = data.decode("utf-8")
    sys.stdout.write(data)


def printOutput(parser, document, opts):
    """Print the parse result in the format selected by ``opts``.

    parser   -- the HTMLParser that produced ``document``; its log, errors,
                tree and tokenizer stream are read here.
    document -- the parse result; nothing is serialized when it is None.
    opts     -- the option values produced by getOptParser().

    One of the xml / tree / hilite / html outputs is chosen, in that order
    of precedence.  The detected encoding, the parser log and the list of
    parse errors are printed in addition when requested.
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                _writeStdout(lxml.etree.tostring(document))
            elif tb == "etree":
                _writeStdout(utils.default_etree.tostring(document))
        elif opts.tree:
            # A fragment parse may give a sequence; wrap a single document
            # so both cases are handled by the same loop.
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            _writeStdout(document.hilite("utf-8"))
        elif opts.html:
            # Forward every serializer option that exists on the command
            # line; a missing quote character means "use the default".
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                if hasattr(opts, opt):
                    kwargs[opt] = getattr(opts, opt)
            if not kwargs.get('quote_char'):
                kwargs.pop('quote_char', None)

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(text)
                if not text.endswith('\n'):
                    sys.stdout.write('\n')

    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")


def getOptParser():
    """Build and return the OptionParser describing the command line."""
    parser = OptionParser(usage=__doc__)

    parser.add_option("-p", "--profile", action="store_true", default=False,
                      dest="profile", help="Use the cProfile profiler to "
                      "produce a detailed log of the run")

    parser.add_option("-t", "--time",
                      action="store_true", default=False, dest="time",
                      help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")

    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder", default="etree")

    parser.add_option("-e", "--error", action="store_true", default=False,
                      dest="error", help="Print a list of parse errors")

    parser.add_option("-f", "--fragment", action="store_true", default=False,
                      dest="fragment", help="Parse as a fragment")

    parser.add_option("", "--tree", action="store_true", default=False,
                      dest="tree", help="Output as debug tree")

    parser.add_option("-x", "--xml", action="store_true", default=False,
                      dest="xml", help="Output as xml")

    parser.add_option("", "--no-html", action="store_false", default=True,
                      dest="html", help="Don't output html")

    parser.add_option("", "--hilite", action="store_true", default=False,
                      dest="hilite", help="Output as formatted highlighted code.")

    parser.add_option("-c", "--encoding", action="store_true", default=False,
                      dest="encoding", help="Print character encoding used")

    parser.add_option("", "--inject-meta-charset", action="store_true",
                      default=False, dest="inject_meta_charset",
                      help="inject <meta charset>")

    parser.add_option("", "--strip-whitespace", action="store_true",
                      default=False, dest="strip_whitespace",
                      help="strip whitespace")

    parser.add_option("", "--omit-optional-tags", action="store_true",
                      default=False, dest="omit_optional_tags",
                      help="omit optional tags")

    parser.add_option("", "--quote-attr-values", action="store_true",
                      default=False, dest="quote_attr_values",
                      help="quote attribute values")

    parser.add_option("", "--use-best-quote-char", action="store_true",
                      default=False, dest="use_best_quote_char",
                      help="use best quote character")

    parser.add_option("", "--quote-char", action="store",
                      default=None, dest="quote_char",
                      help="quote character")

    parser.add_option("", "--no-minimize-boolean-attributes",
                      action="store_false", default=True,
                      dest="minimize_boolean_attributes",
                      help="minimize boolean attributes")

    parser.add_option("", "--use-trailing-solidus", action="store_true",
                      default=False, dest="use_trailing_solidus",
                      help="use trailing solidus")

    parser.add_option("", "--space-before-trailing-solidus",
                      action="store_true", default=False,
                      dest="space_before_trailing_solidus",
                      help="add space before trailing solidus")

    parser.add_option("", "--escape-lt-in-attrs", action="store_true",
                      default=False, dest="escape_lt_in_attrs",
                      help="escape less than signs in attribute values")

    parser.add_option("", "--escape-rcdata", action="store_true",
                      default=False, dest="escape_rcdata",
                      help="escape rcdata element values")

    parser.add_option("", "--sanitize", action="store_true", default=False,
                      dest="sanitize", help="sanitize")

    parser.add_option("-l", "--log", action="store_true", default=False,
                      dest="log", help="log state transitions")

    return parser


if __name__ == "__main__":
    parse()