Optimised PCDATA Data State a bit (saves maybe 3%)
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401240
philiptaylor committed Dec 18, 2008
1 parent cfb1e85 commit b7c7de7
Showing 1 changed file, src/html5lib/tokenizer.py, with 13 additions and 6 deletions.
@@ -354,15 +354,22 @@ def dataState(self):
             self.tokenQueue.append({"type": "SpaceCharacters", "data":
               data + self.stream.charsUntil(spaceCharacters, True)})
             # No need to update lastFourChars here, since the first space will
-            # have already broken any <!-- or --> sequences
+            # have already been appended to lastFourChars and will have broken
+            # any <!-- or --> sequences
         else:
-            chars = self.stream.charsUntil(("&", "<", ">", "-"))
-            self.tokenQueue.append({"type": "Characters", "data":
-              data + chars})
-            self.lastFourChars += chars[-4:]
-            self.lastFourChars = self.lastFourChars[-4:]
+            if self.contentModelFlag in\
+              (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
+                chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
+                self.lastFourChars += chars[-4:]
+                self.lastFourChars = self.lastFourChars[-4:]
+            else:
+                chars = self.stream.charsUntil((u"&", u"<"))
+                # lastFourChars only needs to be kept up-to-date if we're
+                # in CDATA or RCDATA, so ignore it here
+            self.tokenQueue.append({"type": "Characters", "data":
+              data + chars})
         return True
 
     def entityDataState(self):
         entity = self.consumeEntity()
         if entity:
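For context on why this helps: lastFourChars exists only so the tokenizer can spot "<!--" and "-->" sequences, which matter in the CDATA and RCDATA content models (script/style text) but not in PCDATA. After this change the PCDATA path stops only at "&" and "<", so it consumes longer runs per call and skips the per-run string slicing, which is presumably where the quoted ~3% comes from. A rough sketch of the idea, using hypothetical helper names (chars_until, consume_data_run) rather than html5lib's real code:

    CDATA, RCDATA, PCDATA = "CDATA", "RCDATA", "PCDATA"

    def chars_until(text, pos, stop_chars):
        """Consume characters from text[pos:] until one of stop_chars is hit."""
        start = pos
        while pos < len(text) and text[pos] not in stop_chars:
            pos += 1
        return text[start:pos], pos

    def consume_data_run(text, pos, content_model, last_four=""):
        # Hypothetical sketch of the optimisation, not html5lib's API.
        if content_model in (CDATA, RCDATA):
            # Must also stop at ">" and "-", and keep the last four characters
            # around so "<!--" / "-->" sequences can still be detected later.
            chars, pos = chars_until(text, pos, ("&", "<", ">", "-"))
            last_four = (last_four + chars)[-4:]
        else:
            # PCDATA: only "&" and "<" matter, so a longer run comes back from
            # one call and the lastFourChars bookkeeping is skipped entirely.
            chars, pos = chars_until(text, pos, ("&", "<"))
        return chars, pos, last_four

    # In PCDATA the whole run "a -- b > c" is consumed in one call; the same
    # input in CDATA/RCDATA stops at the first "-".
    print(consume_data_run("a -- b > c<b>", 0, PCDATA))  # ('a -- b > c', 10, '')
    print(consume_data_run("a -- b > c<b>", 0, CDATA))   # ('a ', 2, 'a ')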

0 comments on commit b7c7de7
