Skip to content

Commit

Permalink
Add some parser benchmarks
Browse files Browse the repository at this point in the history
These tests are based on two things: some from WPT, and some based on Anolis
  • Loading branch information
gsnedders committed Jun 17, 2020
1 parent d49afd3 commit 0fdd819
Show file tree
Hide file tree
Showing 36 changed files with 8,220 additions and 0 deletions.
57 changes: 57 additions & 0 deletions benchmarks/bench_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import io
import os
import sys

import pyperf

sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
import html5lib # noqa: E402


def bench_parse(fh, treebuilder):
    """Parse the document in ``fh`` once using the named tree builder.

    ``fh`` is rewound to its start before parsing, so the same file object
    can be handed to this function on every benchmark iteration. The
    resulting tree is discarded; only the parse itself is of interest.

    :arg fh: a seekable file-like object holding the HTML source
    :arg treebuilder: tree builder name passed through to ``html5lib.parse``
    """
    fh.seek(0)
    html5lib.parse(fh, treebuilder=treebuilder, useChardet=False)


def bench_serialize(loops, fh, treebuilder):
    """Time ``loops`` serializations of the document read from ``fh``.

    The document is parsed once, before the timer starts, so only the
    ``html5lib.serialize`` calls fall inside the measured region.

    :arg loops: number of times to serialize the parsed document
    :arg fh: a seekable file-like object holding the HTML source
    :arg treebuilder: tree builder name used both to parse and to serialize
    :returns: the difference between two ``pyperf.perf_counter()`` readings
        taken immediately before and after the serialization loop
    """
    fh.seek(0)
    doc = html5lib.parse(fh, treebuilder=treebuilder, useChardet=False)

    range_it = range(loops)
    t0 = pyperf.perf_counter()

    # The iteration value is unused; "_" avoids rebinding the "loops" parameter.
    for _ in range_it:
        html5lib.serialize(doc, tree=treebuilder, encoding="ascii", inject_meta_charset=False)

    return pyperf.perf_counter() - t0


# Benchmark groups that can be selected with the optional positional
# command-line argument; all of them run when none is given.
BENCHMARKS = [
    "parse",
    "serialize",
]


def add_cmdline_args(cmd, args):
    """Append the selected benchmark name, if any, to the command list.

    This function is handed to ``pyperf.Runner`` as its ``add_cmdline_args``
    callback. ``cmd`` is modified in place and nothing is returned.

    :arg cmd: list of command-line arguments to extend
    :arg args: parsed arguments object exposing a ``benchmark`` attribute
    """
    if args.benchmark:
        cmd.append(args.benchmark)


if __name__ == "__main__":
    runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
    runner.metadata["description"] = "Run benchmarks based on Anolis"
    runner.argparser.add_argument("benchmark", nargs="?", choices=BENCHMARKS)

    args = runner.parse_args()
    # Without a positional argument every benchmark group is run.
    if args.benchmark:
        benchmarks = (args.benchmark,)
    else:
        benchmarks = BENCHMARKS

    # Read the input into an in-memory buffer once; the benchmark functions
    # rewind it themselves before each parse.
    with open(os.path.join(os.path.dirname(__file__), "data", "html.html"), "rb") as fh:
        source = io.BytesIO(fh.read())

    # Tree builders exercised by both the parse and the serialize benchmarks.
    treebuilders = ("etree", "dom", "lxml")

    if "parse" in benchmarks:
        for tb in treebuilders:
            runner.bench_func("html_parse_%s" % tb, bench_parse, source, tb)

    if "serialize" in benchmarks:
        for tb in treebuilders:
            runner.bench_time_func("html_serialize_%s" % tb, bench_serialize, source, tb)
45 changes: 45 additions & 0 deletions benchmarks/bench_wpt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import io
import os
import sys

import pyperf

sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
import html5lib # noqa: E402


def bench_html5lib(fh):
    """Parse the document in ``fh`` once with the "etree" tree builder.

    ``fh`` is rewound to its start before parsing, so the same file object
    can be reused across benchmark iterations. The parsed tree is discarded.

    :arg fh: a seekable file-like object holding the HTML source
    """
    fh.seek(0)
    html5lib.parse(fh, treebuilder="etree", useChardet=False)


def add_cmdline_args(cmd, args):
    """Append the selected benchmark name, if any, to the command list.

    This function is handed to ``pyperf.Runner`` as its ``add_cmdline_args``
    callback. ``cmd`` is modified in place and nothing is returned.

    :arg cmd: list of command-line arguments to extend
    :arg args: parsed arguments object exposing a ``benchmark`` attribute
    """
    if args.benchmark:
        cmd.append(args.benchmark)


# Directory, next to this script, that is searched recursively for inputs.
_WPT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "wpt")

# Maps a benchmark name (the file name with its ".html" suffix removed) to
# the full path of that file.
# NOTE: the key is the bare file name, so files with the same name in
# different subdirectories replace one another; the last one walked wins.
BENCHMARKS = {}
for root, dirs, files in os.walk(_WPT_DIR):
    for f in files:
        if f.endswith(".html"):
            BENCHMARKS[f[: -len(".html")]] = os.path.join(root, f)


if __name__ == "__main__":
    runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
    runner.metadata["description"] = "Run parser benchmarks from WPT"
    runner.argparser.add_argument("benchmark", nargs="?", choices=sorted(BENCHMARKS))

    args = runner.parse_args()
    # Without a positional argument every discovered file is benchmarked,
    # in sorted name order.
    if args.benchmark:
        benchmarks = (args.benchmark,)
    else:
        benchmarks = sorted(BENCHMARKS)

    for bench in benchmarks:
        name = "wpt_%s" % bench
        path = BENCHMARKS[bench]
        # Read the file into an in-memory buffer so the file on disk is
        # closed before the benchmark runs.
        with open(path, "rb") as fh:
            fh2 = io.BytesIO(fh.read())

        runner.bench_func(name, bench_html5lib, fh2)
8 changes: 8 additions & 0 deletions benchmarks/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
The files in this directory are derived from:

* `html.html`: from [html](https://github.com/whatwg/html), revision
77db356a293f2b152b648c836b6989d17afe42bb. This is the first 5000 lines of `source`. (This is
representative of the input to [Anolis](https://bitbucket.org/ms2ger/anolis/); the first 5000 lines
were chosen to make it parse in a reasonable time.)

* `wpt`: see `wpt/README.md`.
Loading

0 comments on commit 0fdd819

Please sign in to comment.