Skip to content

Commit

Permalink
Allow changing character encoding
Browse files Browse the repository at this point in the history
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401257
  • Loading branch information
jgraham committed Jan 10, 2009
1 parent f2bf97f commit e4021af
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 105 deletions.
3 changes: 2 additions & 1 deletion parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
#RELEASE remove
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
#END RELEASE
from html5lib import html5parser, liberalxmlparser, sanitizer, tokenizer
from html5lib import html5parser, liberalxmlparser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants

Expand Down
4 changes: 3 additions & 1 deletion src/html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -1073,7 +1073,6 @@
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
'utf7': 'utf-7',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
Expand All @@ -1100,3 +1099,6 @@

class DataLossWarning(UserWarning):
pass

class ReparseException(Exception):
pass
50 changes: 24 additions & 26 deletions src/html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
from constants import tokenTypes
from constants import tokenTypes, ReparseException

def parse(doc, treebuilderName="simpletree", encoding=None):
tb = treebuilders.getTreeBuilder(treebuilderName)
def parse(doc, treebuilder="simpletree", encoding=None):
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb)
return p.parse(doc, encoding=encoding)

Expand Down Expand Up @@ -80,18 +80,29 @@ def __init__(self, tree = simpletree.TreeBuilder,

def _parse(self, stream, innerHTML=False, container="div",
encoding=None, parseMeta=True, useChardet=True, **kwargs):

self.tree.reset()
self.firstStartTag = False
self.errors = []
self.compatMode = "no quirks"

self.innerHTMLMode = innerHTML
self.container = container
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
parseMeta=parseMeta,
useChardet=useChardet, **kwargs)
self.reset()

if innerHTML:
self.innerHTML = container.lower()
while True:
try:
self.mainLoop()
break
except ReparseException, e:
self.reset()

def reset(self):
self.tree.reset()
self.firstStartTag = False
self.errors = []
self.compatMode = "no quirks"

if self.innerHTMLMode:
self.innerHTML = self.container.lower()

if self.innerHTML in cdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
Expand All @@ -113,9 +124,9 @@ def _parse(self, stream, innerHTML=False, container="div",
# We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too
self.lastPhase = None

self.beforeRCDataPhase = None


def mainLoop(self):
(CharactersToken,
SpaceCharactersToken,
StartTagToken,
Expand Down Expand Up @@ -287,18 +298,6 @@ def __init__(self, parser, tree):

def processEOF(self):
raise NotImplementedError
self.tree.generateImpliedEndTags()
if len(self.tree.openElements) > 2:
self.parser.parseError("expected-closing-tag-but-got-eof")
elif len(self.tree.openElements) == 2 and\
self.tree.openElements[1].name != "body":
# This happens for framesets or something?
self.parser.parseError("expected-closing-tag-but-got-eof")
elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
# XXX This is not what the specification says. Not sure what to do
# here.
self.parser.parseError("eof-in-innerhtml")
# Betting ends.

def processComment(self, data):
# For most phases the following is correct. Where it's not it will be
Expand Down Expand Up @@ -601,8 +600,7 @@ def startTagMeta(self, name, attributes):

if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
if "charset" in attributes:
codec = inputstream.codecName(attributes["charset"])
self.parser.tokenizer.stream.changeEncoding(codec)
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
elif "content" in attributes:
data = inputstream.EncodingBytes(attributes["content"])
parser = inputstream.ContentAttrParser(data)
Expand Down
160 changes: 102 additions & 58 deletions src/html5lib/inputstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import types

from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings
from constants import encodings, ReparseException

#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = [str(item) for item in spaceCharacters]
Expand All @@ -16,6 +16,82 @@

# Cache for charsUntil()
charsUntilRegEx = {}

class BufferedStream:
    """Buffering for streams that do not have buffering of their own
    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        # Wrapped stream plus the chunks already read from it.
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset within that chunk

    def tell(self):
        """Return the current position as an absolute offset."""
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        """Reposition within already-buffered data.

        Only offsets that have already been read from the underlying
        stream (i.e. fall inside the buffer) may be seeked to.
        """
        # <= (not <): seeking to the exact end of the buffered data is a
        # legitimate position (the next read goes to the stream).
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        # Walk forward over whole chunks until the target falls inside
        # chunk i.
        while len(self.buffer[i]) < offset:
            # BUG FIX: the original subtracted ``pos`` (the absolute
            # target) instead of the length of the chunk being skipped,
            # so any seek past the first chunk landed at a wrong offset.
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Read up to ``bytes`` characters, serving buffered data first."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) - 1 and
              self.position[1] == len(self.buffer[-1])):
            # Positioned at the very end of the buffered data: read the
            # underlying stream directly.  (The original compared
            # position[0] with len(self.buffer), which can never match
            # because chunk indices are 0-based; the dead branch was
            # harmless but this is the intended fast path.)
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        # Total length of everything read from the stream so far.
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        # Read a fresh chunk, remember it, and move to its end.
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        # Serve as much as possible from buffered chunks, falling back
        # to the underlying stream for any remainder.
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # Request is satisfied entirely inside this chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Consume the rest of this chunk and move to the next.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            # (The original bound list.append's None result to an unused
            # local ``data``; the pointless assignment has been dropped.)
            rv.append(bufferedData[bufferOffset:
                                   bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return "".join(rv)



class HTMLInputStream:
"""Provides a unicode stream of characters to the HTMLTokenizer.
Expand Down Expand Up @@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)

self.reset()

def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')

Expand Down Expand Up @@ -100,6 +179,10 @@ def openStream(self, source):
self.charEncoding = ("utf-8", "certain")
import cStringIO
stream = cStringIO.StringIO(str(source))

if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
stream = BufferedStream(stream)

return stream

def detectEncoding(self, parseMeta=True, chardet=True):
Expand Down Expand Up @@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
self.seek("".join(buffers), 0)
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
Expand All @@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):

def changeEncoding(self, newEncoding):
newEncoding = codecName(newEncoding)
if newEncoding == "utf16":
newEncoding = "utf8"

if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None:
return
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0] and "certian")
self.charEncoding = (self.charEncoding[0], "certian")
else:
raise NotImplementedError, "Cannot change character encoding mid stream"

self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certian")
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)

def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the
Expand All @@ -182,56 +267,21 @@ def detectBOM(self):

# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.seek(string, encoding and seek or 0)
self.rawStream.seek(encoding and seek or 0)

return encoding

def seek(self, buffer, n):
"""Unget buffer[n:]"""
if hasattr(self.rawStream, 'unget'):
self.rawStream.unget(buffer[n:])
return

if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass

class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data

self.rawStream = BufferedStream(buffer[n:], self.rawStream)

def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer)
self.seek(buffer, 0)
self.rawStream.seek(0)
encoding = parser.getEncoding()

if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"

return encoding

def updatePosition(self, chars):
Expand Down Expand Up @@ -485,13 +535,6 @@ def getEncoding(self):
break
if not keepParsing:
break
if self.encoding is not None:
self.encoding = self.encoding.strip()
#Spec violation that complies with hsivonen + mjs
if (ascii_punctuation_re.sub("", self.encoding) in
("utf16", "utf16be", "utf16le",
"utf32", "utf32be", "utf32le")):
self.encoding = "utf-8"

return self.encoding

Expand Down Expand Up @@ -666,11 +709,12 @@ def parse(self):
except StopIteration:
return None


def codecName(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if (encoding is not None and type(encoding) == types.StringType):
if (encoding is not None and type(encoding) in types.StringTypes):
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
return encodings.get(canonicalName, None)
else:
return None
10 changes: 6 additions & 4 deletions src/html5lib/treebuilders/etree_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def _getChildNodes(self):
def testSerializer(element):
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
Expand Down Expand Up @@ -79,10 +80,11 @@ def serializeElement(element, indent=0):
elif type(element.tag) == type(etree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
rv.append("|%s<%s>"%(' '*indent, filter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
rv.append('|%s%s="%s"' % (' '*(indent+2),
filter.fromXmlName(name), value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
Expand Down Expand Up @@ -239,8 +241,8 @@ def getFragment(self):
return fragment

def insertDoctype(self, name, publicId, systemId):
if not name:
warnings.warn("lxml cannot represent null doctype", DataLossWarning)
if not name or ihatexml.nonXmlBMPRegexp.search(name):
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype

Expand Down
Loading

0 comments on commit e4021af

Please sign in to comment.