diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index 20798097..11885dcc 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -987,12 +987,15 @@ def processSpaceCharactersDropNewline(self, token):
         self.tree.insertText(data)
 
     def processCharacters(self, token):
+        if token["data"] == u"\u0000":
+            #The tokenizer should always emit null on its own
+            return
         self.tree.reconstructActiveFormattingElements()
         self.tree.insertText(token["data"])
         #This must be bad for performance
         if (self.parser.framesetOK and
-            any([char not in set(u"\ufffd") | spaceCharacters
-                 for char in token["data"]])):
+            any([char not in spaceCharacters
+                 for char in token["data"]])):
             self.parser.framesetOK = False
 
     def processSpaceCharacters(self, token):
@@ -2195,6 +2198,8 @@ def processEOF(self):
         assert self.parser.innerHTML
 
     def processCharacters(self, token):
+        if token["data"] == u"\u0000":
+            return
         self.tree.insertText(token["data"])
 
     def startTagOption(self, token):
@@ -2375,8 +2380,11 @@ def processCharacters(self, token):
             new_token = self.parser.phases["inBody"].processCharacters(token)
             self.parser.resetInsertionMode()
             return new_token
-
-        self.parser.framesetOK = False
+        elif token["data"] == u"\u0000":
+            token["data"] = u"\uFFFD"
+        elif (self.parser.framesetOK and
+              any(char not in spaceCharacters for char in token["data"])):
+            self.parser.framesetOK = False
         Phase.processCharacters(self, token)
 
     def processEOF(self):
diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index b424d360..8a49a3af 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -107,7 +107,7 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
             if token[0] == "StartTag" or token[0] == "EndTag":
                 token.pop()
 
-    if not ignoreErrorOrder:
+    if not ignoreErrorOrder and not ignoreErrors:
         return expectedTokens == receivedTokens
     else:
         #Sort the tokens into two groups; non-parse errors and parse errors
diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py
index 7ed348c0..8a29d33c 100644
--- a/html5lib/tokenizer.py
+++ b/html5lib/tokenizer.py
@@ -268,7 +268,7 @@ def dataState(self):
             self.tokenQueue.append({"type": tokenTypes["ParseError"],
               "data":"invalid-codepoint"})
             self.tokenQueue.append({"type": tokenTypes["Characters"],
-              "data": u"\uFFFD"})
+              "data": u"\u0000"})
         elif data is EOF:
             # Tokenization ends.
             return False
@@ -282,7 +282,7 @@ def dataState(self):
         # have already been appended to lastFourChars and will have broken
         # any sequences
         else:
-            chars = self.stream.charsUntil((u"&", u"<"))
+            chars = self.stream.charsUntil((u"&", u"<", u"\u0000"))
             self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
               data + chars})
         return True
@@ -646,7 +646,7 @@ def scriptDataEscapedState(self):
         elif data == EOF:
             self.state = self.dataState
         else:
-            chars = self.stream.charsUntil((u"<-", u"\u0000"))
+            chars = self.stream.charsUntil((u"<", u"-", u"\u0000"))
             self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
               data + chars})
         return True
@@ -1150,7 +1150,7 @@ def markupDeclarationOpenState(self):
             self.state = self.doctypeState
             return True
         elif (charStack[-1] == "[" and
-              self.parser is not None and 
+              self.parser is not None and
               self.parser.phase == self.parser.phases["inForeignContent"] and
               self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
             matched = True
@@ -1731,8 +1731,15 @@ def cdataSectionState(self):
             if matched:
                 break
         data = "".join(data)
+        #Deal with null here rather than in the parser
+        nullCount = data.count(u"\u0000")
+        if nullCount > 0:
+            for i in xrange(nullCount):
+                self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                        "data": "invalid-codepoint"})
+            data = data.replace(u"\u0000", u"\uFFFD")
         if data:
-            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
-                data})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": data})
         self.state = self.dataState
         return True