Phase 2 of the null handling changes
James Graham committed Nov 12, 2010
1 parent ff6fd06 commit 520a96c
Showing 3 changed files with 26 additions and 11 deletions.
16 changes: 12 additions & 4 deletions html5lib/html5parser.py
@@ -987,12 +987,15 @@ def processSpaceCharactersDropNewline(self, token):
         self.tree.insertText(data)
 
     def processCharacters(self, token):
+        if token["data"] == u"\u0000":
+            #The tokenizer should always emit null on its own
+            return
         self.tree.reconstructActiveFormattingElements()
         self.tree.insertText(token["data"])
         #This must be bad for performance
         if (self.parser.framesetOK and
-            any([char not in set(u"\ufffd") | spaceCharacters
-                 for char in token["data"]])):
+            any([char not in spaceCharacters
+                 for char in token["data"]])):
             self.parser.framesetOK = False
 
     def processSpaceCharacters(self, token):
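
One behavioural nuance of this hunk, shown as a quick illustration (not part of the commit): u"\ufffd" used to be treated like whitespace for the frameset-ok check, whereas it now flips the flag, since raw nulls no longer reach this code path.

from html5lib.constants import spaceCharacters

assert not any(char not in set(u"\ufffd") | spaceCharacters
               for char in u"\ufffd")   # old check: U+FFFD ignored
assert any(char not in spaceCharacters
           for char in u"\ufffd")       # new check: U+FFFD counts
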
@@ -2195,6 +2198,8 @@ def processEOF(self):
         assert self.parser.innerHTML
 
     def processCharacters(self, token):
+        if token["data"] == u"\u0000":
+            return
         self.tree.insertText(token["data"])
 
     def startTagOption(self, token):
@@ -2375,8 +2380,11 @@ def processCharacters(self, token):
             new_token = self.parser.phases["inBody"].processCharacters(token)
             self.parser.resetInsertionMode()
             return new_token
-
-        self.parser.framesetOK = False
+        elif token["data"] == u"\u0000":
+            token["data"] = u"\uFFFD"
+        elif (not self.parser.framesetOK and
+              any(char not in spaceCharacters for char in token["data"])):
+            self.parser.framesetOK = False
         Phase.processCharacters(self, token)
 
     def processEOF(self):
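
For context, a minimal parser-level check (not part of the commit; assumes html5lib's public HTMLParser API of this era, Python 2):

import html5lib

# After this change, a null character token reaching the "in body"
# phase is dropped outright rather than replaced, on the assumption
# that the tokenizer has already reported and handled it.
parser = html5lib.HTMLParser()
# u"\u0000" sits between "a" and "b"; the parsed text content of the
# <p> element should come out as just u"ab".
tree = parser.parse(u"<p>a\u0000b</p>")
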
2 changes: 1 addition & 1 deletion html5lib/tests/test_tokenizer.py
@@ -107,7 +107,7 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
             if token[0] == "StartTag" or token[0] == "EndTag":
                 token.pop()
 
-    if not ignoreErrorOrder:
+    if not ignoreErrorOrder and not ignoreErrors:
        return expectedTokens == receivedTokens
    else:
        #Sort the tokens into two groups; non-parse errors and parse errors
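
To see why the extra ignoreErrors guard matters, here is a hypothetical standalone illustration (not from the test suite): with errors ignored, two token streams that differ only in ParseError entries should still compare equal, which the plain == fast path would miss.

expected = [["ParseError"], ["Character", u"\ufffd"]]
received = [["Character", u"\ufffd"]]

def stripErrors(tokens):
    # Drop ParseError tokens before comparing, mirroring what the
    # ignoreErrors branch of tokensMatch has to achieve.
    return [token for token in tokens if token[0] != "ParseError"]

assert expected != received
assert stripErrors(expected) == stripErrors(received)
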
19 changes: 13 additions & 6 deletions html5lib/tokenizer.py
@@ -268,7 +268,7 @@ def dataState(self):
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                     "data":"invalid-codepoint"})
             self.tokenQueue.append({"type": tokenTypes["Characters"],
-                                    "data": u"\uFFFD"})
+                                    "data": u"\u0000"})
         elif data is EOF:
             # Tokenization ends.
             return False
@@ -282,7 +282,7 @@ def dataState(self):
             # have already been appended to lastFourChars and will have broken
             # any <!-- or --> sequences
         else:
-            chars = self.stream.charsUntil((u"&", u"<"))
+            chars = self.stream.charsUntil((u"&", u"<", u"\u0000"))
             self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                     data + chars})
         return True
@@ -646,7 +646,7 @@ def scriptDataEscapedState(self):
         elif data == EOF:
             self.state = self.dataState
         else:
-            chars = self.stream.charsUntil((u"<-", u"\u0000"))
+            chars = self.stream.charsUntil((u"<", u"-", u"\u0000"))
             self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                     data + chars})
         return True
@@ -1150,7 +1150,7 @@ def markupDeclarationOpenState(self):
             self.state = self.doctypeState
             return True
         elif (charStack[-1] == "[" and
-              self.parser is not None and 
+              self.parser is not None and
               self.parser.phase == self.parser.phases["inForeignContent"] and
               self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
             matched = True
@@ -1731,8 +1731,15 @@ def cdataSectionState(self):
             if matched:
                 break
         data = "".join(data)
+        #Deal with null here rather than in the parser
+        nullCount = data.count(u"\u0000")
+        if nullCount > 0:
+            for i in xrange(nullCount):
+                self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                        "data": "invalid-codepoint"})
+            data = data.replace(u"\u0000", u"\uFFFD")
         if data:
-            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
-                                    data})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": data})
         self.state = self.dataState
         return True
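
A rough smoke test for the tokenizer-side behaviour (not part of the commit; assumes the HTMLTokenizer entry point of this era, Python 2):

from html5lib.tokenizer import HTMLTokenizer

# In the data state a literal NUL should now yield an invalid-codepoint
# ParseError followed by a u"\u0000" Characters token, leaving any
# replacement (or dropping) to the parser phases above.
for token in HTMLTokenizer(u"a\u0000b"):
    print token
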
