Python streaming, and Ruby progress towards that goal

--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40774
ssameerr · Jun 19, 2007 · e706c6e · e706c6e
1 parent 2213b78
commit e706c6e
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 19 deletions.
diff --git a/parse.py b/parse.py
@@ -30,6 +30,8 @@ def parse():
  import urllib
  f = urllib.urlopen(f).read()
  except: pass
+ elif f == '-':
+ f = sys.stdin
  else:
  try:
  # Try opening from file system

diff --git a/src/html5parser.py b/src/html5parser.py
@@ -82,7 +82,7 @@ def _parse(self, stream, innerHTML=False, container="div",
  self.errors = []
 
  self.tokenizer = self.tokenizer_class(stream, encoding,
- parseMeta=innerHTML)
+ parseMeta=not innerHTML)
 
  if innerHTML:
  self.innerHTML = container.lower()

diff --git a/src/inputstream.py b/src/inputstream.py
@@ -87,7 +87,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
  import chardet
  buffer = self.rawStream.read()
  encoding = chardet.detect(buffer)['encoding']
- self.rawStream = self.openStream(buffer)
+ self.seek(buffer, 0)
  except ImportError:
  pass
  # If all else fails use the default encoding
@@ -127,18 +127,50 @@ def detectBOM(self):
  seek = 2
 
 
- #AT - move this to the caller?
- # Set the read position past the BOM if one was found, otherwise
- # set it to the start of the stream
- self.rawStream.seek(encoding and seek or 0)
+ self.seek(string, encoding and seek or 0)
 
  return encoding
 
+ def seek(self, buffer, n):
+ """Unget buffer[n:]"""
+ if hasattr(self.rawStream, 'unget'):
+ self.rawStream.unget(buffer[n:])
+ return 
+
+ try:
+ self.rawStream.seek(n)
+ except IOError:
+ class BufferedStream:
+ def __init__(self, data, stream):
+ self.data = data
+ self.stream = stream
+ def read(self, chars=-1):
+ if chars == -1 or chars > len(self.data):
+ result = self.data
+ self.data = ''
+ if chars == -1:
+ return result + self.stream.read()
+ else:
+ return result + self.stream.read(chars-len(result))
+ elif not self.data:
+ return self.stream.read(chars)
+ else:
+ result = self.data[:chars]
+ self.data = self.data[chars:]
+ return result
+ def unget(self, data):
+ if self.data:
+ self.data += data
+ else:
+ self.data = data
+ self.rawStream = BufferedStream(buffer[n:], self.rawStream)
+
  def detectEncodingMeta(self):
  """Report the encoding declared by the meta element
  """
- parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
- self.rawStream.seek(0)
+ buffer = self.rawStream.read(self.numBytesMeta)
+ parser = EncodingParser(buffer)
+ self.seek(buffer, 0)
  return parser.getEncoding()
 
  def position(self):
@@ -195,18 +227,9 @@ def charsUntil(self, characters, opposite = False):
  # Put the character stopped on back to the front of the queue
  # from where it came.
  c = charStack.pop()
- if c != EOF:
- self.queue.insert(0, c)
+ if c != EOF:
+ self.queue.insert(0, c)
 
- # XXX the following is need for correct line number reporting apparently
- # but it causes to break other tests with the fixes in tokenizer. I have
- # no idea why...
- #
- #if c != EOF and self.tell <= len(self.dataStream) and \
- # self.dataStream[self.tell - 1] == c[0]:
- # self.tell -= 1
- #else:
- # self.queue.insert(0, c)
  return u"".join(charStack)
 
 class EncodingBytes(str):