diff --git a/.pytest.expect b/.pytest.expect index c88e99b9..5f3b6194 100644 Binary files a/.pytest.expect and b/.pytest.expect differ diff --git a/CHANGES.rst b/CHANGES.rst index 4d0a1996..64162ccf 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,6 +22,10 @@ Released on XXX * Move testsuite to ``py.test``. +* Fix #124: move to webencodings for decoding the input byte stream; + this makes html5lib compliant with the Encoding Standard, and + introduces a required dependency on webencodings. + 0.9999999/1.0b8 ~~~~~~~~~~~~~~~ diff --git a/html5lib/constants.py b/html5lib/constants.py index d938e0ae..f6e38cbf 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -2846,235 +2846,6 @@ 0x9F: "\u0178", } -encodings = { - '437': 'cp437', - '850': 'cp850', - '852': 'cp852', - '855': 'cp855', - '857': 'cp857', - '860': 'cp860', - '861': 'cp861', - '862': 'cp862', - '863': 'cp863', - '865': 'cp865', - '866': 'cp866', - '869': 'cp869', - 'ansix341968': 'ascii', - 'ansix341986': 'ascii', - 'arabic': 'iso8859-6', - 'ascii': 'ascii', - 'asmo708': 'iso8859-6', - 'big5': 'big5', - 'big5hkscs': 'big5hkscs', - 'chinese': 'gbk', - 'cp037': 'cp037', - 'cp1026': 'cp1026', - 'cp154': 'ptcp154', - 'cp367': 'ascii', - 'cp424': 'cp424', - 'cp437': 'cp437', - 'cp500': 'cp500', - 'cp775': 'cp775', - 'cp819': 'windows-1252', - 'cp850': 'cp850', - 'cp852': 'cp852', - 'cp855': 'cp855', - 'cp857': 'cp857', - 'cp860': 'cp860', - 'cp861': 'cp861', - 'cp862': 'cp862', - 'cp863': 'cp863', - 'cp864': 'cp864', - 'cp865': 'cp865', - 'cp866': 'cp866', - 'cp869': 'cp869', - 'cp936': 'gbk', - 'cpgr': 'cp869', - 'cpis': 'cp861', - 'csascii': 'ascii', - 'csbig5': 'big5', - 'cseuckr': 'cp949', - 'cseucpkdfmtjapanese': 'euc_jp', - 'csgb2312': 'gbk', - 'cshproman8': 'hp-roman8', - 'csibm037': 'cp037', - 'csibm1026': 'cp1026', - 'csibm424': 'cp424', - 'csibm500': 'cp500', - 'csibm855': 'cp855', - 'csibm857': 'cp857', - 'csibm860': 'cp860', - 'csibm861': 'cp861', - 'csibm863': 'cp863', - 'csibm864': 'cp864', - 'csibm865': 'cp865', - 'csibm866': 'cp866', - 'csibm869': 'cp869', - 'csiso2022jp': 'iso2022_jp', - 'csiso2022jp2': 'iso2022_jp_2', - 'csiso2022kr': 'iso2022_kr', - 'csiso58gb231280': 'gbk', - 'csisolatin1': 'windows-1252', - 'csisolatin2': 'iso8859-2', - 'csisolatin3': 'iso8859-3', - 'csisolatin4': 'iso8859-4', - 'csisolatin5': 'windows-1254', - 'csisolatin6': 'iso8859-10', - 'csisolatinarabic': 'iso8859-6', - 'csisolatincyrillic': 'iso8859-5', - 'csisolatingreek': 'iso8859-7', - 'csisolatinhebrew': 'iso8859-8', - 'cskoi8r': 'koi8-r', - 'csksc56011987': 'cp949', - 'cspc775baltic': 'cp775', - 'cspc850multilingual': 'cp850', - 'cspc862latinhebrew': 'cp862', - 'cspc8codepage437': 'cp437', - 'cspcp852': 'cp852', - 'csptcp154': 'ptcp154', - 'csshiftjis': 'shift_jis', - 'csunicode11utf7': 'utf-7', - 'cyrillic': 'iso8859-5', - 'cyrillicasian': 'ptcp154', - 'ebcdiccpbe': 'cp500', - 'ebcdiccpca': 'cp037', - 'ebcdiccpch': 'cp500', - 'ebcdiccphe': 'cp424', - 'ebcdiccpnl': 'cp037', - 'ebcdiccpus': 'cp037', - 'ebcdiccpwt': 'cp037', - 'ecma114': 'iso8859-6', - 'ecma118': 'iso8859-7', - 'elot928': 'iso8859-7', - 'eucjp': 'euc_jp', - 'euckr': 'cp949', - 'extendedunixcodepackedformatforjapanese': 'euc_jp', - 'gb18030': 'gb18030', - 'gb2312': 'gbk', - 'gb231280': 'gbk', - 'gbk': 'gbk', - 'greek': 'iso8859-7', - 'greek8': 'iso8859-7', - 'hebrew': 'iso8859-8', - 'hproman8': 'hp-roman8', - 'hzgb2312': 'hz', - 'ibm037': 'cp037', - 'ibm1026': 'cp1026', - 'ibm367': 'ascii', - 'ibm424': 'cp424', - 'ibm437': 'cp437', - 'ibm500': 'cp500', - 'ibm775': 'cp775', - 'ibm819': 'windows-1252', - 'ibm850': 'cp850', - 'ibm852': 'cp852', - 'ibm855': 'cp855', - 'ibm857': 'cp857', - 'ibm860': 'cp860', - 'ibm861': 'cp861', - 'ibm862': 'cp862', - 'ibm863': 'cp863', - 'ibm864': 'cp864', - 'ibm865': 'cp865', - 'ibm866': 'cp866', - 'ibm869': 'cp869', - 'iso2022jp': 'iso2022_jp', - 'iso2022jp2': 'iso2022_jp_2', - 'iso2022kr': 'iso2022_kr', - 'iso646irv1991': 'ascii', - 'iso646us': 'ascii', - 'iso88591': 'windows-1252', - 'iso885910': 'iso8859-10', - 'iso8859101992': 'iso8859-10', - 'iso885911987': 'windows-1252', - 'iso885913': 'iso8859-13', - 'iso885914': 'iso8859-14', - 'iso8859141998': 'iso8859-14', - 'iso885915': 'iso8859-15', - 'iso885916': 'iso8859-16', - 'iso8859162001': 'iso8859-16', - 'iso88592': 'iso8859-2', - 'iso885921987': 'iso8859-2', - 'iso88593': 'iso8859-3', - 'iso885931988': 'iso8859-3', - 'iso88594': 'iso8859-4', - 'iso885941988': 'iso8859-4', - 'iso88595': 'iso8859-5', - 'iso885951988': 'iso8859-5', - 'iso88596': 'iso8859-6', - 'iso885961987': 'iso8859-6', - 'iso88597': 'iso8859-7', - 'iso885971987': 'iso8859-7', - 'iso88598': 'iso8859-8', - 'iso885981988': 'iso8859-8', - 'iso88599': 'windows-1254', - 'iso885991989': 'windows-1254', - 'isoceltic': 'iso8859-14', - 'isoir100': 'windows-1252', - 'isoir101': 'iso8859-2', - 'isoir109': 'iso8859-3', - 'isoir110': 'iso8859-4', - 'isoir126': 'iso8859-7', - 'isoir127': 'iso8859-6', - 'isoir138': 'iso8859-8', - 'isoir144': 'iso8859-5', - 'isoir148': 'windows-1254', - 'isoir149': 'cp949', - 'isoir157': 'iso8859-10', - 'isoir199': 'iso8859-14', - 'isoir226': 'iso8859-16', - 'isoir58': 'gbk', - 'isoir6': 'ascii', - 'koi8r': 'koi8-r', - 'koi8u': 'koi8-u', - 'korean': 'cp949', - 'ksc5601': 'cp949', - 'ksc56011987': 'cp949', - 'ksc56011989': 'cp949', - 'l1': 'windows-1252', - 'l10': 'iso8859-16', - 'l2': 'iso8859-2', - 'l3': 'iso8859-3', - 'l4': 'iso8859-4', - 'l5': 'windows-1254', - 'l6': 'iso8859-10', - 'l8': 'iso8859-14', - 'latin1': 'windows-1252', - 'latin10': 'iso8859-16', - 'latin2': 'iso8859-2', - 'latin3': 'iso8859-3', - 'latin4': 'iso8859-4', - 'latin5': 'windows-1254', - 'latin6': 'iso8859-10', - 'latin8': 'iso8859-14', - 'latin9': 'iso8859-15', - 'ms936': 'gbk', - 'mskanji': 'shift_jis', - 'pt154': 'ptcp154', - 'ptcp154': 'ptcp154', - 'r8': 'hp-roman8', - 'roman8': 'hp-roman8', - 'shiftjis': 'shift_jis', - 'tis620': 'cp874', - 'unicode11utf7': 'utf-7', - 'us': 'ascii', - 'usascii': 'ascii', - 'utf16': 'utf-16', - 'utf16be': 'utf-16-be', - 'utf16le': 'utf-16-le', - 'utf8': 'utf-8', - 'windows1250': 'cp1250', - 'windows1251': 'cp1251', - 'windows1252': 'cp1252', - 'windows1253': 'cp1253', - 'windows1254': 'cp1254', - 'windows1255': 'cp1255', - 'windows1256': 'cp1256', - 'windows1257': 'cp1257', - 'windows1258': 'cp1258', - 'windows936': 'gbk', - 'x-x-big5': 'big5'} - tokenTypes = { "Doctype": 0, "Characters": 1, diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index ae980c55..ed44a552 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -139,7 +139,7 @@ def documentEncoding(self): """ if not hasattr(self, 'tokenizer'): return None - return self.tokenizer.stream.charEncoding[0] + return self.tokenizer.stream.charEncoding[0].name def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 63373db9..20f6c95a 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -1,13 +1,15 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from six import text_type, binary_type from six.moves import http_client, urllib import codecs import re +import webencodings + from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase -from .constants import encodings, ReparseException +from .constants import ReparseException from . import utils from io import StringIO @@ -195,7 +197,7 @@ def __init__(self, source): # List of where new lines occur self.newLines = [0] - self.charEncoding = ("utf-8", "certain") + self.charEncoding = (lookupEncoding("utf-8"), "certain") self.dataStream = self.openStream(source) self.reset() @@ -421,7 +423,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): HTMLUnicodeInputStream.__init__(self, self.rawStream) - self.charEncoding = (codecName(encoding), "certain") + self.charEncoding = (lookupEncoding(encoding), "certain") # Encoding Information # Number of bytes to use when looking for a meta element with @@ -440,8 +442,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True): self.reset() def reset(self): - self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream, - 'replace') + self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace') HTMLUnicodeInputStream.reset(self) def openStream(self, source): @@ -491,30 +492,25 @@ def detectEncoding(self, parseMeta=True, chardet=True): buffers.append(buffer) detector.feed(buffer) detector.close() - encoding = detector.result['encoding'] + encoding = lookupEncoding(detector.result['encoding']) self.rawStream.seek(0) except ImportError: pass # If all else fails use the default encoding if encoding is None: confidence = "tentative" - encoding = self.defaultEncoding - - # Substitute for equivalent encodings: - encodingSub = {"iso-8859-1": "windows-1252"} - - if encoding.lower() in encodingSub: - encoding = encodingSub[encoding.lower()] + encoding = lookupEncoding(self.defaultEncoding) return encoding, confidence def changeEncoding(self, newEncoding): assert self.charEncoding[1] != "certain" - newEncoding = codecName(newEncoding) - if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"): - newEncoding = "utf-8" + newEncoding = lookupEncoding(newEncoding) if newEncoding is None: return + if newEncoding.name in ("utf-16be", "utf-16le"): + newEncoding = lookupEncoding("utf-8") + assert newEncoding is not None elif newEncoding == self.charEncoding[0]: self.charEncoding = (self.charEncoding[0], "certain") else: @@ -529,8 +525,8 @@ def detectBOM(self): encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', - codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be', - codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be' + codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be', + codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be' } # Go to beginning of file and read in 4 bytes @@ -550,9 +546,12 @@ def detectBOM(self): # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream - self.rawStream.seek(encoding and seek or 0) - - return encoding + if encoding: + self.rawStream.seek(seek) + return lookupEncoding(encoding) + else: + self.rawStream.seek(0) + return None def detectEncodingMeta(self): """Report the encoding declared by the meta element @@ -563,8 +562,8 @@ def detectEncodingMeta(self): self.rawStream.seek(0) encoding = parser.getEncoding() - if encoding in ("utf-16", "utf-16-be", "utf-16-le"): - encoding = "utf-8" + if encoding is not None and encoding.name in ("utf-16be", "utf-16le"): + encoding = lookupEncoding("utf-8") return encoding @@ -727,7 +726,7 @@ def handleMeta(self): return False elif attr[0] == b"charset": tentativeEncoding = attr[1] - codec = codecName(tentativeEncoding) + codec = lookupEncoding(tentativeEncoding) if codec is not None: self.encoding = codec return False @@ -735,7 +734,7 @@ def handleMeta(self): contentParser = ContentAttrParser(EncodingBytes(attr[1])) tentativeEncoding = contentParser.parse() if tentativeEncoding is not None: - codec = codecName(tentativeEncoding) + codec = lookupEncoding(tentativeEncoding) if codec is not None: if hasPragma: self.encoding = codec @@ -892,16 +891,19 @@ def parse(self): return None -def codecName(encoding): +def lookupEncoding(encoding): """Return the python codec name corresponding to an encoding or None if the string doesn't correspond to a valid encoding.""" - if isinstance(encoding, bytes): + if isinstance(encoding, binary_type): try: encoding = encoding.decode("ascii") except UnicodeDecodeError: return None - if encoding: - canonicalName = ascii_punctuation_re.sub("", encoding).lower() - return encodings.get(canonicalName, None) + + if encoding is not None: + try: + return webencodings.lookup(encoding) + except AttributeError: + return None else: return None diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index d774ce0f..837e989f 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -12,20 +12,6 @@ from html5lib import HTMLParser, inputstream -class Html5EncodingTestCase(unittest.TestCase): - def test_codec_name_a(self): - self.assertEqual(inputstream.codecName("utf-8"), "utf-8") - - def test_codec_name_b(self): - self.assertEqual(inputstream.codecName("utf8"), "utf-8") - - def test_codec_name_c(self): - self.assertEqual(inputstream.codecName(" utf8 "), "utf-8") - - def test_codec_name_d(self): - self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252") - - def runParserEncodingTest(data, encoding): p = HTMLParser() assert p.documentEncoding is None @@ -43,7 +29,7 @@ def runPreScanEncodingTest(data, encoding): if len(data) > stream.numBytesMeta: return - assert encoding == stream.charEncoding[0], errorMessage(data, encoding, stream.charEncoding[0]) + assert encoding == stream.charEncoding[0].name, errorMessage(data, encoding, stream.charEncoding[0].name) def test_encoding(): @@ -64,4 +50,4 @@ def test_encoding(): def test_chardet(): with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp: encoding = inputstream.HTMLInputStream(fp.read()).charEncoding - assert encoding[0].lower() == "big5" + assert encoding[0].name == "big5" diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 4436ef8a..ed203766 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -86,29 +86,29 @@ class HTMLInputStreamTest(unittest.TestCase): def test_char_ascii(self): stream = HTMLInputStream(b"'", encoding='ascii') - self.assertEqual(stream.charEncoding[0], 'ascii') + self.assertEqual(stream.charEncoding[0].name, 'windows-1252') self.assertEqual(stream.char(), "'") def test_char_utf8(self): stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8') - self.assertEqual(stream.charEncoding[0], 'utf-8') + self.assertEqual(stream.charEncoding[0].name, 'utf-8') self.assertEqual(stream.char(), '\u2018') def test_char_win1252(self): stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252')) - self.assertEqual(stream.charEncoding[0], 'windows-1252') + self.assertEqual(stream.charEncoding[0].name, 'windows-1252') self.assertEqual(stream.char(), "\xa9") self.assertEqual(stream.char(), "\xf1") self.assertEqual(stream.char(), "\u2019") def test_bom(self): stream = HTMLInputStream(codecs.BOM_UTF8 + b"'") - self.assertEqual(stream.charEncoding[0], 'utf-8') + self.assertEqual(stream.charEncoding[0].name, 'utf-8') self.assertEqual(stream.char(), "'") def test_utf_16(self): stream = HTMLInputStream((' ' * 1025).encode('utf-16')) - self.assertTrue(stream.charEncoding[0] in ['utf-16-le', 'utf-16-be'], stream.charEncoding) + self.assertTrue(stream.charEncoding[0].name in ['utf-16le', 'utf-16be'], stream.charEncoding) self.assertEqual(len(stream.charsUntil(' ', True)), 1025) def test_newlines(self): diff --git a/requirements.txt b/requirements.txt index ffe2fce4..15cae9dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ six +webencodings diff --git a/setup.py b/setup.py index 7b06b45e..187a4169 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ packages=packages, install_requires=[ 'six', + 'webencodings', ], extras_require={ # A empty extra that only has a conditional marker will be diff --git a/tox.ini b/tox.ini index e66298d5..4a29b553 100644 --- a/tox.ini +++ b/tox.ini @@ -7,6 +7,8 @@ deps = pytest pytest-expect>=1.0,<2.0 mock + base: six + base: webencodings py26-base: ordereddict optional: -r{toxinidir}/requirements-optional.txt diff --git a/utils/iana_parse.py b/utils/iana_parse.py deleted file mode 100644 index 6dde94c2..00000000 --- a/utils/iana_parse.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python -import sys -import urllib.request, urllib.error, urllib.parse -import codecs - -def main(): - encodings = [] - f = urllib.request.urlopen(sys.argv[1]) - for line in f: - if line.startswith("Name: ") or line.startswith("Alias: "): - enc = line.split()[1] - try: - codecs.lookup(enc) - if enc.lower not in encodings: - encodings.append(enc.lower()) - except LookupError: - pass - sys.stdout.write("encodings = frozenset((\n") - for enc in encodings: - sys.stdout.write(' "%s",\n'%enc) - sys.stdout.write(' ))') - -if __name__ == "__main__": - main() \ No newline at end of file