diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index 20798097..11885dcc 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -987,12 +987,15 @@ def processSpaceCharactersDropNewline(self, token):
self.tree.insertText(data)
def processCharacters(self, token):
+ if token["data"] == u"\u0000":
+ # The tokenizer is expected to emit each null as its own token; drop it.
+ return
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
#This must be bad for performance
if (self.parser.framesetOK and
- any([char not in set(u"\ufffd") | spaceCharacters
- for char in token["data"]])):
+ any(char not in spaceCharacters
+ for char in token["data"])):
self.parser.framesetOK = False
def processSpaceCharacters(self, token):
@@ -2195,6 +2198,8 @@ def processEOF(self):
assert self.parser.innerHTML
def processCharacters(self, token):
+ if token["data"] == u"\u0000":
+ return  # a token that is exactly one null is dropped; nothing is inserted
self.tree.insertText(token["data"])
def startTagOption(self, token):
@@ -2375,8 +2380,11 @@ def processCharacters(self, token):
new_token = self.parser.phases["inBody"].processCharacters(token)
self.parser.resetInsertionMode()
return new_token
-
- self.parser.framesetOK = False
+ elif token["data"] == u"\u0000":
+ token["data"] = u"\uFFFD"  # lone null is replaced; framesetOK is left untouched
+ elif (self.parser.framesetOK and  # only clear the flag while it is still set
+ any(char not in spaceCharacters for char in token["data"])):
+ self.parser.framesetOK = False
Phase.processCharacters(self, token)
def processEOF(self):
diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index b424d360..8a49a3af 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -107,7 +107,7 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
if token[0] == "StartTag" or token[0] == "EndTag":
token.pop()
- if not ignoreErrorOrder:
+ if not ignoreErrorOrder and not ignoreErrors:
return expectedTokens == receivedTokens
else:
#Sort the tokens into two groups; non-parse errors and parse errors
diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py
index 7ed348c0..8a29d33c 100644
--- a/html5lib/tokenizer.py
+++ b/html5lib/tokenizer.py
@@ -268,7 +268,7 @@ def dataState(self):
self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data":"invalid-codepoint"})
self.tokenQueue.append({"type": tokenTypes["Characters"],
- "data": u"\uFFFD"})
+ "data": u"\u0000"})
elif data is EOF:
# Tokenization ends.
return False
@@ -282,7 +282,7 @@ def dataState(self):
# have already been appended to lastFourChars and will have broken
# any sequences
else:
- chars = self.stream.charsUntil((u"&", u"<"))
+ chars = self.stream.charsUntil((u"&", u"<", u"\u0000"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
@@ -646,7 +646,7 @@ def scriptDataEscapedState(self):
elif data == EOF:
self.state = self.dataState
else:
- chars = self.stream.charsUntil((u"<-", u"\u0000"))
+ chars = self.stream.charsUntil((u"<", u"-", u"\u0000"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
@@ -1150,7 +1150,7 @@ def markupDeclarationOpenState(self):
self.state = self.doctypeState
return True
elif (charStack[-1] == "[" and
- self.parser is not None and
+ self.parser is not None and
self.parser.phase == self.parser.phases["inForeignContent"] and
self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
matched = True
@@ -1731,8 +1731,15 @@ def cdataSectionState(self):
if matched:
break
data = "".join(data)
+ #Deal with null here rather than in the parser
+ nullCount = data.count(u"\u0000")
+ if nullCount > 0:
+ for i in xrange(nullCount):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ data = data.replace(u"\u0000", u"\uFFFD")
if data:
- self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
- data})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": data})
self.state = self.dataState
return True