Support HTTP Content-Type charset parameter

--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40776
ssameerr · Jun 20, 2007 · a344753 · a344753
1 parent 45a7f4d
commit a344753
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 33 deletions.
diff --git a/parse.py b/parse.py
@@ -21,14 +21,19 @@
 def parse():
  optParser = getOptParser()
  opts,args = optParser.parse_args()
+ encoding = None
 
  try:
  f = args[-1]
  # Try opening from the internet
  if f.startswith('http://'):
  try:
- import urllib
- f = urllib.urlopen(f).read()
+ import urllib, cgi
+ f = urllib.urlopen(f)
+ contentType = f.headers.get('content-type')
+ if contentType:
+ (mediaType, params) = cgi.parse_header(contentType)
+ encoding = params.get('charset')
  except: pass
  elif f == '-':
  f = sys.stdin
@@ -57,7 +62,7 @@ def parse():
  import hotshot
  import hotshot.stats
  prof = hotshot.Profile('stats.prof')
- prof.runcall(parseMethod, f)
+ prof.runcall(parseMethod, f, encoding=encoding)
  prof.close()
  # XXX - We should use a temp file here
  stats = hotshot.stats.load('stats.prof')
@@ -67,14 +72,15 @@ def parse():
  elif opts.time:
  import time
  t0 = time.time()
- document = parseMethod(f)
+ document = parseMethod(f, encoding=encoding)
  t1 = time.time()
  printOutput(p, document, opts)
  t2 = time.time()
- sys.stdout.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
+ print "\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)
  else:
- document = parseMethod(f)
+ document = parseMethod(f, encoding=encoding)
  printOutput(p, document, opts)
+ print encoding
 
 def printOutput(parser, document, opts):
  if opts.encoding:

diff --git a/src/inputstream.py b/src/inputstream.py
@@ -126,7 +126,6 @@ def detectBOM(self):
  encoding = bomDict.get(string[:2]) # UTF-16
  seek = 2
 
-
  self.seek(string, encoding and seek or 0)
 
  return encoding
@@ -137,33 +136,38 @@ def seek(self, buffer, n):
  self.rawStream.unget(buffer[n:])
  return 
 
- try:
- self.rawStream.seek(n)
- except IOError:
- class BufferedStream:
- def __init__(self, data, stream):
- self.data = data
- self.stream = stream
- def read(self, chars=-1):
- if chars == -1 or chars > len(self.data):
- result = self.data
- self.data = ''
- if chars == -1:
- return result + self.stream.read()
- else:
- return result + self.stream.read(chars-len(result))
- elif not self.data:
- return self.stream.read(chars)
- else:
- result = self.data[:chars]
- self.data = self.data[chars:]
- return result
- def unget(self, data):
- if self.data:
- self.data += data
+ if hasattr(self.rawStream, 'seek'):
+ try:
+ self.rawStream.seek(n)
+ return
+ except IOError:
+ pass
+
+ class BufferedStream:
+ def __init__(self, data, stream):
+ self.data = data
+ self.stream = stream
+ def read(self, chars=-1):
+ if chars == -1 or chars > len(self.data):
+ result = self.data
+ self.data = ''
+ if chars == -1:
+ return result + self.stream.read()
  else:
- self.data = data
- self.rawStream = BufferedStream(buffer[n:], self.rawStream)
+ return result + self.stream.read(chars-len(result))
+ elif not self.data:
+ return self.stream.read(chars)
+ else:
+ result = self.data[:chars]
+ self.data = self.data[chars:]
+ return result
+ def unget(self, data):
+ if self.data:
+ self.data += data
+ else:
+ self.data = data
+
+ self.rawStream = BufferedStream(buffer[n:], self.rawStream)
 
  def detectEncodingMeta(self):
  """Report the encoding declared by the meta element