Skip to content

Commit

Permalink
Add start of SVG+MathML branch
Browse files Browse the repository at this point in the history
--HG--
branch : svgmathml
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401261
  • Loading branch information
jgraham committed Jan 17, 2009
1 parent bf5f514 commit f47bc4f
Show file tree
Hide file tree
Showing 12 changed files with 911 additions and 775 deletions.
5 changes: 3 additions & 2 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
#RELEASE remove
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
#END RELEASE
from html5lib import html5parser, liberalxmlparser, sanitizer, tokenizer
from html5lib import html5parser, liberalxmlparser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants

Expand Down Expand Up @@ -80,7 +81,7 @@ def parse():
t1 = time.time()
printOutput(p, document, opts)
t2 = time.time()
print "\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
else:
document = parseMethod(f, encoding=encoding)
printOutput(p, document, opts)
Expand Down
17 changes: 17 additions & 0 deletions src/html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@
_(u"Unexpected end of file in attribute value (')."),
"eof-in-attribute-value-no-quotes":
_(u"Unexpected end of file in attribute value."),
"unexpected-EOF-after-solidus-in-tag":
_(u"Unexpected end of file in tag. Expected >"),
"unexpected-character-after-soldius-in-tag":
_(u"Unexpected character after / in tag. Expected >"),
"expected-dashes-or-doctype":
_(u"Expected '--' or 'DOCTYPE'. Not found."),
"incorrect-comment":
Expand Down Expand Up @@ -1098,5 +1102,18 @@
"ParseError":7
}

namespaces = {
"html":"http:https://www.w3.org/1999/xhtml",
"mathml":"http:https://www.w3.org/1998/Math/MathML",
"svg":"http:https://www.w3.org/2000/svg",
"xlink":"http:https://www.w3.org/1999/xlink",
"xml":"http:https://www.w3.org/XML/1998/namespace",
"xmlns":"http:https://www.w3.org/2000/xmlns/"
}


class DataLossWarning(UserWarning):
pass

class ReparseException(Exception):
pass
1,210 changes: 612 additions & 598 deletions src/html5lib/html5parser.py

Large diffs are not rendered by default.

160 changes: 102 additions & 58 deletions src/html5lib/inputstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import types

from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings
from constants import encodings, ReparseException

#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = [str(item) for item in spaceCharacters]
Expand All @@ -16,6 +16,82 @@

# Cache for charsUntil()
charsUntilRegEx = {}

class BufferedStream:
"""Buffering for streams that do not have buffering of their own
The buffer is implemented as a list of chunks on the assumption that
joining many strings will be slow since it is O(n**2)
"""

def __init__(self, stream):
self.stream = stream
self.buffer = []
self.position = [-1,0] #chunk number, offset

def tell(self):
pos = 0
for chunk in self.buffer[:self.position[0]]:
pos += len(chunk)
pos += self.position[1]
return pos

def seek(self, pos):
assert pos < self._bufferedBytes()
offset = pos
i = 0
while len(self.buffer[i]) < offset:
offset -= pos
i += 1
self.position = [i, offset]

def read(self, bytes):
if not self.buffer:
return self._readStream(bytes)
elif (self.position[0] == len(self.buffer) and
self.position[1] == len(self.buffer[-1])):
return self._readStream(bytes)
else:
return self._readFromBuffer(bytes)

def _bufferedBytes(self):
return sum([len(item) for item in self.buffer])

def _readStream(self, bytes):
data = self.stream.read(bytes)
self.buffer.append(data)
self.position[0] += 1
self.position[1] = len(data)
return data

def _readFromBuffer(self, bytes):
remainingBytes = bytes
rv = []
bufferIndex = self.position[0]
bufferOffset = self.position[1]
while bufferIndex < len(self.buffer) and remainingBytes != 0:
assert remainingBytes > 0
bufferedData = self.buffer[bufferIndex]

if remainingBytes <= len(bufferedData) - bufferOffset:
bytesToRead = remainingBytes
self.position = [bufferIndex, bufferOffset + bytesToRead]
else:
bytesToRead = len(bufferedData) - bufferOffset
self.position = [bufferIndex, len(bufferedData)]
bufferIndex += 1
data = rv.append(bufferedData[bufferOffset:
bufferOffset + bytesToRead])
remainingBytes -= bytesToRead

bufferOffset = 0

if remainingBytes:
rv.append(self._readStream(remainingBytes))

return "".join(rv)



class HTMLInputStream:
"""Provides a unicode stream of characters to the HTMLTokenizer.
Expand Down Expand Up @@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)

self.reset()

def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')

Expand Down Expand Up @@ -100,6 +179,10 @@ def openStream(self, source):
self.charEncoding = ("utf-8", "certain")
import cStringIO
stream = cStringIO.StringIO(str(source))

if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
stream = BufferedStream(stream)

return stream

def detectEncoding(self, parseMeta=True, chardet=True):
Expand Down Expand Up @@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
self.seek("".join(buffers), 0)
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
Expand All @@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):

def changeEncoding(self, newEncoding):
newEncoding = codecName(newEncoding)
if newEncoding == "utf16":
newEncoding = "utf8"

if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None:
return
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0] and "certian")
self.charEncoding = (self.charEncoding[0], "certian")
else:
raise NotImplementedError, "Cannot change character encoding mid stream"

self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certian")
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)

def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the
Expand All @@ -182,56 +267,21 @@ def detectBOM(self):

# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.seek(string, encoding and seek or 0)
self.rawStream.seek(encoding and seek or 0)

return encoding

def seek(self, buffer, n):
"""Unget buffer[n:]"""
if hasattr(self.rawStream, 'unget'):
self.rawStream.unget(buffer[n:])
return

if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass

class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data

self.rawStream = BufferedStream(buffer[n:], self.rawStream)

def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer)
self.seek(buffer, 0)
self.rawStream.seek(0)
encoding = parser.getEncoding()

if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"

return encoding

def updatePosition(self, chars):
Expand Down Expand Up @@ -485,13 +535,6 @@ def getEncoding(self):
break
if not keepParsing:
break
if self.encoding is not None:
self.encoding = self.encoding.strip()
#Spec violation that complies with hsivonen + mjs
if (ascii_punctuation_re.sub("", self.encoding) in
("utf16", "utf16be", "utf16le",
"utf32", "utf32be", "utf32le")):
self.encoding = "utf-8"

return self.encoding

Expand Down Expand Up @@ -666,11 +709,12 @@ def parse(self):
except StopIteration:
return None


def codecName(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if (encoding is not None and type(encoding) == types.StringType):
if (encoding is not None and type(encoding) in types.StringTypes):
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
return encodings.get(canonicalName, None)
else:
return None
Loading

0 comments on commit f47bc4f

Please sign in to comment.