forked from html5lib/html5lib-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
executable file
·106 lines (90 loc) · 3.29 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
"""usage: %prog [options] filename
Parse a document to a simpletree tree, with optional profiling
"""
import sys
import os
from optparse import OptionParser
from src import html5parser, treebuilders
def convertTreeDump(treedump):
    """Convert the output of str(document) to something more readable.

    The first line of the dump is discarded.  Every remaining line that
    begins with "|" has its first three characters removed; all other
    lines are passed through unchanged.  The result is re-joined with
    newlines.
    """
    body = treedump.split("\n")[1:]
    return "\n".join(
        [entry[3:] if entry.startswith("|") else entry for entry in body]
    )
def parse():
    """Parse the file named on the command line and print the result.

    Options (see getOptParser) select the treebuilder, the output format,
    whether parse errors are listed, and whether the run is profiled or
    timed.  The last positional argument is passed to the parser as the
    input.  Exits with status 1 when no positional argument is given.
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()

    if not args:
        print("No filename provided. Use -h for help")
        sys.exit(1)
    f = args[-1]

    p = html5parser.HTMLParser(tree=_getTreeBuilder(opts.treebuilder))

    if opts.profile:
        # Profiling prints only the profiler statistics, not the tree.
        _profileParse(p, f)
    elif opts.time:
        import time
        t0 = time.time()
        document = p.parse(f)
        t1 = time.time()
        _printOutput(p, document, opts)
        t2 = time.time()
        print("\n\nRun took: %fs (plus %fs to print the output)"
              % (t1 - t0, t2 - t1))
    else:
        document = p.parse(f)
        _printOutput(p, document, opts)


def _getTreeBuilder(name):
    """Return the TreeBuilder class for the treebuilder module `name`.

    When `name` is None the simpletree TreeBuilder is returned.  An
    ImportError for the requested module is reported and re-raised; any
    other failure while loading it falls back to simpletree.
    """
    # NOTE(review): the module-level import takes `treebuilders` from the
    # `src` package, while the imports below name a top-level
    # `treebuilders` package -- confirm both resolve to the same code.
    if name is not None:
        try:
            # A non-empty fromlist makes __import__ return the submodule
            # itself rather than the top-level package.
            return __import__("treebuilders." + name,
                              None, None, "treebuilders").TreeBuilder
        except ImportError as err:
            print("Treebuilder %s not found" % err)
            raise
        except Exception as err:
            # Keep the best-effort fallback, but tell the user that the
            # requested treebuilder is not the one being used.
            sys.stderr.write(
                "Treebuilder %s could not be loaded (%s); "
                "using simpletree instead\n" % (name, err))
    import treebuilders.simpletree
    return treebuilders.simpletree.TreeBuilder


def _profileParse(p, f):
    """Run p.parse(f, False) under the hotshot profiler and print the stats."""
    import hotshot
    import hotshot.stats
    prof = hotshot.Profile('stats.prof')
    try:
        prof.runcall(p.parse, f, False)
    finally:
        # Close the profiler even when the parse raises.
        prof.close()
    # XXX - We should use a temp file here
    stats = hotshot.stats.load('stats.prof')
    stats.strip_dirs()
    stats.sort_stats('time')
    stats.print_stats()


def _printOutput(p, document, opts):
    """Print the parsed document and, if requested, the parse errors."""
    if opts.xml:
        print(document.toxml('utf-8'))
    else:
        print(p.tree.testSerializer(document).encode("utf-8"))
    if opts.error:
        print("\nParse errors:\n" + "\n".join(p.errors))
def getOptParser():
    """Build the command-line option parser for this script.

    The module docstring is used as the usage message.

    Returns an OptionParser that accepts -p/--profile, -t/--time,
    -b/--treebuilder, -e/--error and -x/--xml.  The boolean flags default
    to False; the treebuilder option has no default, so it is None when
    not supplied.
    """
    parser = OptionParser(usage=__doc__)

    parser.add_option("-p", "--profile", action="store_true", default=False,
                      dest="profile", help="Use the hotshot profiler to "
                      "produce a detailed log of the run")

    parser.add_option("-t", "--time",
                      action="store_true", default=False, dest="time",
                      help="Time the run using time.time (may not be "
                      "accurate on all platforms, especially for short runs)")

    # Previously the only option without help text, so -h did not say what
    # it expects.
    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder",
                      help="Name of the treebuilder module to use "
                      "(simpletree when omitted)")

    parser.add_option("-e", "--error", action="store_true", default=False,
                      dest="error", help="Print a list of parse errors")

    parser.add_option("-x", "--xml", action="store_true", default=False,
                      dest="xml", help="output as xml")

    return parser
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    parse()