Skip to content

Commit

Permalink
Python streaming, and Ruby progress towards that goal
Browse files Browse the repository at this point in the history
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40774
  • Loading branch information
rubys committed Jun 19, 2007
1 parent 2213b78 commit e706c6e
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 19 deletions.
2 changes: 2 additions & 0 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def parse():
import urllib
f = urllib.urlopen(f).read()
except: pass
elif f == '-':
f = sys.stdin
else:
try:
# Try opening from file system
Expand Down
2 changes: 1 addition & 1 deletion src/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def _parse(self, stream, innerHTML=False, container="div",
self.errors = []

self.tokenizer = self.tokenizer_class(stream, encoding,
parseMeta=innerHTML)
parseMeta=not innerHTML)

if innerHTML:
self.innerHTML = container.lower()
Expand Down
59 changes: 41 additions & 18 deletions src/inputstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
import chardet
buffer = self.rawStream.read()
encoding = chardet.detect(buffer)['encoding']
self.rawStream = self.openStream(buffer)
self.seek(buffer, 0)
except ImportError:
pass
# If all else fails use the default encoding
Expand Down Expand Up @@ -127,18 +127,50 @@ def detectBOM(self):
seek = 2


#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
self.seek(string, encoding and seek or 0)

return encoding

def seek(self, buffer, n):
    """Rewind rawStream so that the next read returns buffer[n:] again.

    buffer -- data that was just read from self.rawStream
    n      -- number of leading items of buffer to treat as consumed

    Three strategies are tried in order:

    1. If rawStream already has an unget() method, hand buffer[n:]
       back to it.
    2. Otherwise ask rawStream to seek to absolute position n.
    3. If the stream cannot seek (IOError, or it has no seek method at
       all), replace self.rawStream with a small wrapper that serves
       buffer[n:] first and then continues reading from the original
       stream.
    """
    if hasattr(self.rawStream, 'unget'):
        self.rawStream.unget(buffer[n:])
        return

    try:
        self.rawStream.seek(n)
    except (IOError, AttributeError):
        # Non-seekable stream (e.g. a pipe), or an object that only
        # offers read(): fall back to buffering the pushed-back data.
        class BufferedStream:
            """Serve pushed-back data before reading from the stream."""

            def __init__(self, data, stream):
                self.data = data
                self.stream = stream

            def read(self, chars=-1):
                """Read chars items, or everything if chars is negative/None."""
                pending = self.data
                if chars is None or chars < 0:
                    # pending[:0] is an empty value of the same type as
                    # the data (str or bytes), so later concatenation
                    # never mixes types.
                    self.data = pending[:0]
                    return pending + self.stream.read()
                if chars <= len(pending):
                    self.data = pending[chars:]
                    return pending[:chars]
                self.data = pending[:0]
                return pending + self.stream.read(chars - len(pending))

            def unget(self, data):
                """Push data back so it is returned by the next read."""
                # The pushed-back data was read *before* whatever is
                # still pending, so it must go in front of it.
                if self.data:
                    self.data = data + self.data
                else:
                    self.data = data

        self.rawStream = BufferedStream(buffer[n:], self.rawStream)

def detectEncodingMeta(self):
    """Report the encoding declared by the meta element.

    Reads up to self.numBytesMeta items from self.rawStream, lets
    EncodingParser inspect them, then pushes the same data back so the
    stream can be read again from the start.

    Returns whatever EncodingParser.getEncoding() reports.
    """
    buffer = self.rawStream.read(self.numBytesMeta)
    parser = EncodingParser(buffer)
    # Rewind through self.seek rather than calling rawStream.seek(0)
    # directly, so streams that cannot seek are handled as well.
    self.seek(buffer, 0)
    return parser.getEncoding()

def position(self):
Expand Down Expand Up @@ -195,18 +227,9 @@ def charsUntil(self, characters, opposite = False):
# Put the character stopped on back to the front of the queue
# from where it came.
c = charStack.pop()
if c != EOF:
self.queue.insert(0, c)
if c != EOF:
self.queue.insert(0, c)

# XXX the following is needed for correct line number reporting, apparently,
# but it breaks other tests with the fixes in tokenizer. I have
# no idea why...
#
#if c != EOF and self.tell <= len(self.dataStream) and \
# self.dataStream[self.tell - 1] == c[0]:
# self.tell -= 1
#else:
# self.queue.insert(0, c)
return u"".join(charStack)

class EncodingBytes(str):
Expand Down

0 comments on commit e706c6e

Please sign in to comment.