forked from html5lib/html5lib-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_parser.py
96 lines (73 loc) · 3.53 KB
/
test_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from __future__ import absolute_import, division, unicode_literals
import os
import sys
import traceback
import warnings
import re
warnings.simplefilter("error")
from .support import get_data_files
from .support import TestData, convert, convertExpected, treeTypes
from html5lib import html5parser, constants
# Run the parse error checks
checkParseErrors = False
# XXX - There should just be one function here but for some reason the testcase
# format differs from the treedump format by a single space character
def convertTreeDump(data):
return "\n".join(convert(3)(data).split("\n")[1:])
namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
def runParserTest(innerHTML, input, expected, errors, treeClass,
namespaceHTMLElements):
with warnings.catch_warnings(record=True) as caughtWarnings:
warnings.simplefilter("always")
p = html5parser.HTMLParser(tree=treeClass,
namespaceHTMLElements=namespaceHTMLElements)
try:
if innerHTML:
document = p.parseFragment(input, innerHTML)
else:
document = p.parse(input)
except:
errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
"\nTraceback:", traceback.format_exc()])
assert False, errorMsg
otherWarnings = [x for x in caughtWarnings
if not issubclass(x.category, constants.DataLossWarning)]
assert len(otherWarnings) == 0, [(x.category, x.message) for x in otherWarnings]
if len(caughtWarnings):
return
output = convertTreeDump(p.tree.testSerializer(document))
expected = convertExpected(expected)
if namespaceHTMLElements:
expected = namespaceExpected(r"\1<html \2>", expected)
errorMsg = "\n".join(["\n\nInput:", input, "\nExpected:", expected,
"\nReceived:", output])
assert expected == output, errorMsg
errStr = []
for (line, col), errorcode, datavars in p.errors:
assert isinstance(datavars, dict), "%s, %s" % (errorcode, repr(datavars))
errStr.append("Line: %i Col: %i %s" % (line, col,
constants.E[errorcode] % datavars))
errorMsg2 = "\n".join(["\n\nInput:", input,
"\nExpected errors (" + str(len(errors)) + "):\n" + "\n".join(errors),
"\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
if checkParseErrors:
assert len(p.errors) == len(errors), errorMsg2
def test_parser():
sys.stderr.write('Testing tree builders ' + " ".join(list(treeTypes.keys())) + "\n")
files = get_data_files('tree-construction')
for filename in files:
testName = os.path.basename(filename).replace(".dat", "")
if testName in ("template",):
continue
tests = TestData(filename, "data")
for index, test in enumerate(tests):
input, errors, innerHTML, expected = [test[key] for key in
('data', 'errors',
'document-fragment',
'document')]
if errors:
errors = errors.split("\n")
for treeName, treeCls in treeTypes.items():
for namespaceHTMLElements in (True, False):
yield (runParserTest, innerHTML, input, expected, errors, treeCls,
namespaceHTMLElements)