Skip to content

Commit

Permalink
Add phase transition logging support
Browse files Browse the repository at this point in the history
  • Loading branch information
James Graham committed Jun 9, 2010
1 parent 205aced commit 5078e07
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 15 deletions.
63 changes: 54 additions & 9 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def startswithany(str, prefixes):
return False

import sys
import types

import inputstream
import tokenizer
Expand All @@ -37,14 +38,18 @@ def startswithany(str, prefixes):
from treebuilders import simpletree

import utils
import constants
from constants import spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
from constants import tokenTypes, ReparseException, namespaces

# Switch for the phase-transition logger.
# NOTE(review): this flag appears to be tested inside the body of class Phase
# (`if debug_log:`), i.e. at the moment that class is defined, so rebinding it
# after this module has been imported presumably does not attach or detach the
# logging metaclass -- confirm before relying on a runtime toggle.
debug_log = True

def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    """Build a document tree from a string or file-like object.

    doc -- the markup to parse, handed unchanged to HTMLParser.parse
    treebuilder -- name given to treebuilders.getTreeBuilder to select the
                   tree implementation
    encoding -- forwarded to HTMLParser.parse
    namespaceHTMLElements -- forwarded to the HTMLParser constructor

    Returns whatever HTMLParser.parse returns for the document.
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding)
Expand All @@ -55,6 +60,17 @@ def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, encoding=encoding)

def method_decorator_metaclass(function):
    """Return a metaclass that applies `function` to every method of a class.

    function -- a decorator: a callable that takes a plain function and
                returns its replacement.

    The returned metaclass inspects the namespace of each class it creates.
    Every attribute that is a plain Python function is replaced by
    function(attribute); all other attributes (constants, descriptors,
    nested classes, ...) are passed through untouched.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Build a fresh namespace instead of rebinding entries of the
            # dict being iterated, and use items() rather than iteritems()
            # so the loop works on both Python 2 and Python 3.
            decorated = {}
            for attributeName, attribute in classDict.items():
                if isinstance(attribute, types.FunctionType):
                    attribute = function(attribute)
                decorated[attributeName] = attribute
            return type.__new__(meta, classname, bases, decorated)
    return Decorated

class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
Expand Down Expand Up @@ -129,6 +145,7 @@ def reset(self):
self.tree.reset()
self.firstStartTag = False
self.errors = []
self.log = [] #only used with debug mode
# "quirks" / "limited quirks" / "no quirks"
self.compatMode = "no quirks"

Expand Down Expand Up @@ -420,6 +437,31 @@ def parseRCDataRawtext(self, token, contentType):

self.phase = self.phases["text"]

def log(function):
    """Logger that records which phase processes each token

    Decorator for phase methods. When the wrapped method is anything other
    than __init__ and is called with at least one positional argument, that
    first argument is treated as a token and a tuple of

        (tokenizer state name, current parser phase class name,
         class name of the receiving object, method name, token info)

    is appended to self.parser.log before the method itself runs. The token
    info is a dict holding the token's type name and, for tag tokens, the
    tag name.

    If the first argument cannot be interpreted as a token (it is not
    subscriptable, has no 'type' entry, or carries an unknown type), the
    offending value is written to stderr and the original exception is
    re-raised, so callers passing something other than a token are exposed.
    """
    # Reverse mapping: numeric token type -> human-readable name.
    type_names = dict((value, key) for key, value in
                      constants.tokenTypes.items())

    def wrapped(self, *args, **kwargs):
        if function.__name__ != "__init__" and args:
            token = args[0]
            try:
                info = {"type": type_names[token['type']]}
            except (KeyError, TypeError):
                # Report the unusable token, then let the error propagate.
                sys.stderr.write("Cannot log token: %r\n" % (token,))
                raise
            if token['type'] in constants.tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        # Logged or not, the wrapped method is always invoked exactly once.
        return function(self, *args, **kwargs)
    return wrapped

class Phase(object):
"""Base class for helper object that implements each phase of processing
"""
Expand All @@ -434,6 +476,9 @@ class Phase(object):
# * EndTag
# - endTag* methods

if debug_log:
__metaclass__ = method_decorator_metaclass(log)

def __init__(self, parser, tree):
self.parser = parser
self.tree = tree
Expand Down Expand Up @@ -1008,7 +1053,7 @@ def startTagForm(self, token):
self.parser.parseError(u"unexpected-start-tag", {"name": "form"})
else:
if self.tree.elementInScope("p"):
self.endTagP("p")
self.endTagP(impliedTagToken("p"))
self.tree.insertElement(token)
self.tree.formPointer = self.tree.openElements[-1]

Expand Down Expand Up @@ -1831,7 +1876,7 @@ def processEOF(self):
return
else:
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
self.parser.phase.processEOF()

Expand All @@ -1847,7 +1892,7 @@ def startTagCol(self, token):

def startTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
self.parser.phase.processStartTag(token)

Expand All @@ -1865,7 +1910,7 @@ def endTagCol(self, token):

def endTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
self.endTagColgroup(impliedTagToken("colgroup"))
if not ignoreEndTag:
self.parser.phase.processEndTag(token)

Expand Down Expand Up @@ -2016,7 +2061,7 @@ def startTagTableCell(self, token):

def startTagTableOther(self, token):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr("tr")
self.endTagTr(impliedTagToken("tr"))
# XXX how are we sure it's always ignored in the innerHTML case?
if not ignoreEndTag:
self.parser.phase.processStartTag(token)
Expand All @@ -2036,15 +2081,15 @@ def endTagTr(self, token):

def endTagTable(self, token):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr("tr")
self.endTagTr(impliedTagToken("tr"))
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
if not ignoreEndTag:
self.parser.phase.processEndTag(token)

def endTagTableRowGroup(self, token):
if self.tree.elementInScope(token["name"], variant="table"):
self.endTagTr("tr")
self.endTagTr(impliedTagToken("tr"))
self.parser.phase.processEndTag(token)
else:
# innerHTML case
Expand Down Expand Up @@ -2187,12 +2232,12 @@ def startTagOptgroup(self, token):

def startTagSelect(self, token):
self.parser.parseError("unexpected-select-in-select")
self.endTagSelect("select")
self.endTagSelect(impliedTagToken("select"))

def startTagInput(self, token):
self.parser.parseError("unexpected-input-in-select")
if self.tree.elementInScope("select", variant="table"):
self.endTagSelect("select")
self.endTagSelect(impliedTagToken("select"))
self.parser.phase.processStartTag(token)

def startTagOther(self, token):
Expand Down
19 changes: 13 additions & 6 deletions parse.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
#!/usr/bin/env python
"""usage: %prog [options] filename
Parse a document to a simpletree tree, with optional profiling
Parse a document to a tree, with optional profiling
"""
#RELEASE move ./examples/

import sys
import os
from optparse import OptionParser

#RELEASE remove
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
#END RELEASE
from html5lib import html5parser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
Expand Down Expand Up @@ -52,6 +48,8 @@ def parse():
else:
tokenizer = HTMLTokenizer

if opts.log:
html5parser.debug_log = True

p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer)

Expand Down Expand Up @@ -87,10 +85,16 @@ def parse():
def printOutput(parser, document, opts):
if opts.encoding:
print "Encoding:", parser.tokenizer.stream.charEncoding

if opts.log:
for item in parser.log:
print item

if opts.xml:
sys.stdout.write(document.toxml("utf-8"))
elif opts.tree:
if not hasattr(document,'__getitem__'): document = [document]
if not hasattr(document,'__getitem__'):
document = [document]
for fragment in document:
print parser.tree.testSerializer(fragment).encode("utf-8")
elif opts.hilite:
Expand Down Expand Up @@ -199,6 +203,9 @@ def getOptParser():
parser.add_option("", "--sanitize", action="store_true", default=False,
dest="sanitize", help="sanitize")

parser.add_option("-l", "--log", action="store_true", default=False,
dest="log", help="log state transitions")

return parser

if __name__ == "__main__":
Expand Down

0 comments on commit 5078e07

Please sign in to comment.