Skip to content

Commit

Permalink
Correctness fixes for EOF handling and parse errors
Browse files Browse the repository at this point in the history
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401143
  • Loading branch information
jgraham committed Apr 8, 2008
1 parent 5999365 commit 454a8ca
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 9 deletions.
8 changes: 8 additions & 0 deletions src/html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@
"unexpected-end-tag-after-frameset":
_(u"Unexpected end tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-body-innerhtml":
_(u"Unexpected end tag after body(innerHtml)"),
"expected-eof-but-got-char":
_(u"Unexpected non-space characters. Expected end of file."),
"expected-eof-but-got-start-tag":
Expand All @@ -242,6 +244,12 @@
"expected-eof-but-got-end-tag":
_(u"Unexpected end tag (%(name)s)"
u". Expected end of file."),
"eof-in-table":
_(u"Unexpected end of file. Expected table content."),
"eof-in-select":
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
Expand Down
83 changes: 74 additions & 9 deletions src/html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ def __init__(self, parser, tree):
self.tree = tree

def processEOF(self):
raise NotImplementedError
self.tree.generateImpliedEndTags()
if len(self.tree.openElements) > 2:
self.parser.parseError("expected-closing-tag-but-got-eof")
Expand Down Expand Up @@ -547,11 +548,11 @@ def appendToHead(self, element):
self.tree.openElements[-1].appendChild(element)

# the real thing
def processEOF(self):
    """Handle end of file while an element of the head is still open.

    If the current node is one of the text-holding head elements
    (title, style, script, noscript), report the missing end tag and
    remove that element from the stack of open elements.  Then run
    ``anythingElse`` and forward EOF to whatever phase the parser is in
    afterwards.
    """
    current = self.tree.openElements[-1]
    if current.name in ("title", "style", "script", "noscript"):
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": current.name})
        # Pop exactly once: only the unclosed element itself is dropped.
        self.tree.openElements.pop()
    self.anythingElse()
    # Presumably anythingElse() has switched self.parser.phase; the EOF is
    # handed to the phase that is current at this point.
    self.parser.phase.processEOF()

Expand Down Expand Up @@ -776,6 +777,15 @@ def addFormattingElement(self, name, attributes):
self.tree.openElements[-1])

# the real deal
def processEOF(self):
    """Handle end of file in this phase.

    Emits one "expected-closing-tag-but-got-eof" parse error when at least
    one open element is not in the set of elements that may remain open
    at end of file; otherwise emits nothing.
    """
    may_stay_open = set(("dd", "dt", "li", "p", "tbody", "td", "tfoot",
                         "th", "thead", "tr", "body", "html"))
    # Walk from the current node towards the root, stopping at the first
    # offending element so the error is reported at most once.
    top_down = self.tree.openElements[::-1]
    if any(element.name not in may_stay_open for element in top_down):
        self.parser.parseError("expected-closing-tag-but-got-eof")
    # Stop parsing

def processSpaceCharactersDropNewline(self, data):
# Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
# want to drop leading newlines
Expand Down Expand Up @@ -1311,8 +1321,8 @@ def __init__(self, parser, tree):
def clearStackToTableContext(self):
# "clear the stack back to a table context"
# Pops entries off self.tree.openElements until the current node is named
# "table" or "html".
# NOTE(review): the parse-error call below appears both live and commented
# out. This looks like the removed and added lines of a diff shown together;
# confirm which version is intended before relying on the error being emitted.
while self.tree.openElements[-1].name not in ("table", "html"):
self.parser.parseError("unexpected-implied-end-tag-in-table",
{"name": self.tree.openElements[-1].name})
#self.parser.parseError("unexpected-implied-end-tag-in-table",
# {"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
# When the current node is <html> it's an innerHTML case

Expand All @@ -1323,6 +1333,13 @@ def getCurrentTable(self):
return self.tree.openElements[i]

# processing methods
def processEOF(self):
    """Handle end of file while table content is expected.

    Reports "eof-in-table" unless the current node is <html>; in that case
    the parser is asserted to be in innerHTML mode and nothing is reported.
    """
    current_is_root = self.tree.openElements[-1].name == "html"
    if current_is_root:
        assert self.parser.innerHTML
        return
    self.parser.parseError("eof-in-table")
    # Stop parsing

def processSpaceCharacters(self, data):
if "tainted" not in self.getCurrentTable()._flags:
self.tree.insertText(data)
Expand Down Expand Up @@ -1454,6 +1471,9 @@ def __init__(self, parser, tree):
def ignoreEndTagCaption(self):
    """Return True when ``elementInScope("caption", True)`` is falsy.

    The second positional argument is passed through unchanged; it
    presumably selects the table variant of the scope check -- confirm
    against ``elementInScope``.
    """
    caption_in_scope = self.tree.elementInScope("caption", True)
    return not caption_in_scope

def processEOF(self):
    """Forward end-of-file handling to the parser's "inBody" phase."""
    body_phase = self.parser.phases["inBody"]
    body_phase.processEOF()

def processCharacters(self, data):
self.parser.phases["inBody"].processCharacters(data)

Expand Down Expand Up @@ -1521,6 +1541,16 @@ def __init__(self, parser, tree):
def ignoreEndTagColgroup(self):
    """Return True when the current node (top of the stack) is <html>."""
    current_node = self.tree.openElements[-1]
    return current_node.name == "html"

def processEOF(self):
    """Handle end of file while inside a column group.

    When the current node is <html> the parser is asserted to be in
    innerHTML mode and nothing else happens.  Otherwise the colgroup is
    closed via ``endTagColgroup`` and, unless that end tag was to be
    ignored, EOF is forwarded to the parser's current phase.
    """
    if self.tree.openElements[-1].name == "html":
        assert self.parser.innerHTML
        return
    # The ignore decision must be sampled before endTagColgroup runs.
    end_tag_ignored = self.ignoreEndTagColgroup()
    self.endTagColgroup("colgroup")
    if end_tag_ignored:
        return
    self.parser.phase.processEOF()

def processCharacters(self, data):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
Expand Down Expand Up @@ -1564,7 +1594,8 @@ def __init__(self, parser, tree):
("html", self.startTagHtml),
("tr", self.startTagTr),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther)
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther

Expand All @@ -1580,13 +1611,18 @@ def __init__(self, parser, tree):
def clearStackToTableBodyContext(self):
# Pops entries off self.tree.openElements until the current node is named
# "tbody", "tfoot", "thead" or "html".
# NOTE(review): the parse-error call below appears both live and commented
# out. This looks like the removed and added lines of a diff shown together;
# confirm which version is intended before relying on the error being emitted.
while self.tree.openElements[-1].name not in ("tbody", "tfoot",
"thead", "html"):
self.parser.parseError("unexpected-implied-end-tag-in-table",
{"name": self.tree.openElements[-1].name})
#self.parser.parseError("unexpected-implied-end-tag-in-table",
# {"name": self.tree.openElements[-1].name})
self.tree.openElements.pop()
# Ending on <html> is asserted to happen only in innerHTML mode.
if self.tree.openElements[-1].name == "html":
assert self.parser.innerHTML

# the rest
def processEOF(self):
    """Forward end-of-file handling to the parser's "inTable" phase."""
    table_phase = self.parser.phases["inTable"]
    table_phase.processEOF()

def processSpaceCharacters(self, data):
    """Forward whitespace character data to the "inTable" phase.

    ``data`` is passed through unchanged.  The call is made exactly once
    so the same run of whitespace is not handled twice.
    """
    self.parser.phases["inTable"].processSpaceCharacters(data)

def processCharacters(self,data):
self.parser.phases["inTable"].processCharacters(data)
Expand Down Expand Up @@ -1676,6 +1712,9 @@ def ignoreEndTagTr(self):
return not self.tree.elementInScope("tr", tableVariant=True)

# the rest
def processEOF(self):
    """Hand end-of-file over to the "inTable" phase registered on the parser."""
    delegate = self.parser.phases["inTable"]
    delegate.processEOF()

def processSpaceCharacters(self, data):
self.parser.phases["inTable"].processSpaceCharacters(data)

Expand Down Expand Up @@ -1757,6 +1796,9 @@ def closeCell(self):
self.endTagTableCell("th")

# the rest
def processEOF(self):
    """Hand end-of-file over to the "inBody" phase registered on the parser."""
    delegate = self.parser.phases["inBody"]
    delegate.processEOF()

def processCharacters(self, data):
self.parser.phases["inBody"].processCharacters(data)

Expand Down Expand Up @@ -1834,6 +1876,12 @@ def __init__(self, parser, tree):
self.endTagHandler.default = self.endTagOther

# http://www.whatwg.org/specs/web-apps/current-work/#in-select
def processEOF(self):
    """Handle end of file while select content is expected.

    Reports "eof-in-select" unless the current node is <html>; in that case
    the parser is asserted to be in innerHTML mode and nothing is reported.
    """
    if self.tree.openElements[-1].name != "html":
        self.parser.parseError("eof-in-select")
    else:
        # The flag is spelled ``innerHTML`` everywhere else it is read on
        # the parser; the ``innerHtml`` spelling did not match it.
        assert self.parser.innerHTML

def processCharacters(self, data):
self.tree.insertText(data)

Expand Down Expand Up @@ -1919,6 +1967,9 @@ def __init__(self, parser, tree):
])
self.endTagHandler.default = self.endTagOther

def processEOF(self):
    """Forward end-of-file handling to the parser's "inSelect" phase."""
    select_phase = self.parser.phases["inSelect"]
    select_phase.processEOF()

def processCharacters(self, data):
self.parser.phases["inSelect"].processCharacters(data)

Expand Down Expand Up @@ -1948,6 +1999,10 @@ def __init__(self, parser, tree):
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther

def processEOF(self):
    """Handle end of file: no work is performed in this phase."""
    # Stop parsing
    return None

def processComment(self, data):
# This is needed because data is to be appended to the <html> element
# here and not to whatever is currently open.
Expand All @@ -1966,7 +2021,7 @@ def processStartTag(self, name, attributes):

def endTagHtml(self,name):
if self.parser.innerHTML:
self.parser.parseError()
self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
else:
# XXX: This may need to be done, not sure:
# Don't set lastPhase to the current phase but to the inBody phase
Expand Down Expand Up @@ -2001,6 +2056,12 @@ def __init__(self, parser, tree):
])
self.endTagHandler.default = self.endTagOther

def processEOF(self):
    """Handle end of file while frameset content is expected.

    Reports "eof-in-frameset" unless the current node is <html>; in that
    case the parser is asserted to be in innerHTML mode and nothing is
    reported.
    """
    current_is_root = self.tree.openElements[-1].name == "html"
    if current_is_root:
        assert self.parser.innerHTML
        return
    self.parser.parseError("eof-in-frameset")

def processCharacters(self, data):
self.parser.parseError("unexpected-char-in-frameset")

Expand Down Expand Up @@ -2054,6 +2115,10 @@ def __init__(self, parser, tree):
])
self.endTagHandler.default = self.endTagOther

def processEOF(self):
    """Handle end of file: nothing is done here and None is returned."""
    # Stop parsing
    return None

def processCharacters(self, data):
self.parser.parseError("unexpected-char-after-frameset")

Expand Down

0 comments on commit 454a8ca

Please sign in to comment.