Skip to content

Commit

Permalink
Optimised invalid-Unicode regexp. Cached match.end(). (Saves about 5-10% in the tokeniser.)
Browse files Browse the repository at this point in the history

--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401158
  • Loading branch information
philiptaylor committed May 28, 2008
1 parent d85d895 commit 66d242f
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions src/html5lib/inputstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# str() form of every item in the letter collections; asciiLetters and
# asciiUppercase themselves are defined outside this view.
asciiLettersBytes = [str(item) for item in asciiLetters]
asciiUppercaseBytes = [str(item) for item in asciiUppercase]

# NOTE(review): the two assignments below look like the removed and added
# versions of one line in a diff rendering. Read as plain Python, the second
# assignment rebinds the name, so only the second pattern is ever used.
#
# First form: an alternation of small character classes and single code
# points. There is no "|" between \U000BFFFE and \U000BFFFF, so that
# alternative only matches the two code points when they appear back to back.
invalid_unicode_re = re.compile(u"[\u0001-\u0008]|[\u000E-\u001F]|[\u007F-\u009F]|[\uD800-\uDFFF]|[\uFDD0-\uFDDF]|\uFFFE|\uFFFF|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE\U000BFFFF|\U000CFFFE|\U000CFFFF|\U000DFFFE|\U000DFFFF|\U000EFFFE|\U000EFFFF|\U000FFFFE|\U000FFFFF|\U0010FFFE|\U0010FFFF")
# Second form: the same ranges and code points folded into one character
# class, so every listed code point (including \U000BFFFE and \U000BFFFF)
# is matched individually.
# NOTE(review): on interpreter builds where \U escapes expand to surrogate
# pairs, astral entries inside a character class would not be treated as
# single members -- confirm against the supported builds.
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDDF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

# Cache for charsUntil()
# (starts empty; the code that fills and reads it is outside this view)
charsUntilRegEx = {}
Expand Down Expand Up @@ -320,9 +320,10 @@ def charsUntil(self, characters, opposite = False):
# Find the longest matching prefix
m = chars.match(self.chunk, self.chunkOffset)
# If not everything matched, return everything up to the part that didn't match
if m.end() != len(self.chunk):
rv.append(self.chunk[self.chunkOffset:m.end()])
self.chunkOffset = m.end()
end = m.end()
if end != len(self.chunk):
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole chunk matched, use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
Expand Down

0 comments on commit 66d242f

Please sign in to comment.