Skip to content

Commit

Permalink
Support http Content-Type charset parameter
Browse files Browse the repository at this point in the history
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40776
  • Loading branch information
rubys committed Jun 20, 2007
1 parent 45a7f4d commit a344753
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 33 deletions.
18 changes: 12 additions & 6 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,19 @@
def parse():
optParser = getOptParser()
opts,args = optParser.parse_args()
encoding = None

try:
f = args[-1]
# Try opening from the internet
if f.startswith('http://'):
try:
import urllib
f = urllib.urlopen(f).read()
import urllib, cgi
f = urllib.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
(mediaType, params) = cgi.parse_header(contentType)
encoding = params.get('charset')
except: pass
elif f == '-':
f = sys.stdin
Expand Down Expand Up @@ -57,7 +62,7 @@ def parse():
import hotshot
import hotshot.stats
prof = hotshot.Profile('stats.prof')
prof.runcall(parseMethod, f)
prof.runcall(parseMethod, f, encoding=encoding)
prof.close()
# XXX - We should use a temp file here
stats = hotshot.stats.load('stats.prof')
Expand All @@ -67,14 +72,15 @@ def parse():
elif opts.time:
import time
t0 = time.time()
document = parseMethod(f)
document = parseMethod(f, encoding=encoding)
t1 = time.time()
printOutput(p, document, opts)
t2 = time.time()
sys.stdout.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
print "\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)
else:
document = parseMethod(f)
document = parseMethod(f, encoding=encoding)
printOutput(p, document, opts)
print encoding

def printOutput(parser, document, opts):
if opts.encoding:
Expand Down
58 changes: 31 additions & 27 deletions src/inputstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ def detectBOM(self):
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2


self.seek(string, encoding and seek or 0)

return encoding
Expand All @@ -137,33 +136,38 @@ def seek(self, buffer, n):
self.rawStream.unget(buffer[n:])
return

try:
self.rawStream.seek(n)
except IOError:
class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass

class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
self.data = data
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data

self.rawStream = BufferedStream(buffer[n:], self.rawStream)

def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
Expand Down

0 comments on commit a344753

Please sign in to comment.