Skip to content

Commit

Permalink
Remove simpletree, changing the default tree builder to etree.
Browse files Browse the repository at this point in the history
  • Loading branch information
ambv authored and gsnedders committed May 7, 2013
1 parent b0dda81 commit 96da7f5
Show file tree
Hide file tree
Showing 15 changed files with 95 additions and 413 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ Change Log

Released on XXX, 2013

* Removed ``simpletree`` from the package. The default tree builder is
now ``etree`` (using the ``xml.etree.ElementTree/cElementTree``
implementation).


0.95
~~~~
Expand Down
12 changes: 6 additions & 6 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from . import treebuilders
from .treebuilders._base import Marker
from .treebuilders import simpletree

from . import utils
from . import constants
Expand All @@ -20,15 +19,15 @@
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements


def parse(doc, treebuilder="simpletree", encoding=None,
def parse(doc, treebuilder="etree", encoding=None,
namespaceHTMLElements=True):
"""Parse a string or file-like object into a tree"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding)


def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
namespaceHTMLElements=True):
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
Expand All @@ -51,9 +50,8 @@ class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""

def __init__(self, tree=simpletree.TreeBuilder,
tokenizer=tokenizer.HTMLTokenizer, strict=False,
namespaceHTMLElements=True, debug=False):
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
strict=False, namespaceHTMLElements=True, debug=False):
"""
strict - raise an exception when a parse error is encountered
Expand All @@ -69,6 +67,8 @@ def __init__(self, tree=simpletree.TreeBuilder,
# Raise an exception on the first error encountered
self.strict = strict

if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
self.tokenizer_class = tokenizer
self.errors = []
Expand Down
2 changes: 1 addition & 1 deletion html5lib/serializer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .htmlserializer import HTMLSerializer


def serialize(input, tree="simpletree", format="html", encoding=None,
def serialize(input, tree="etree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
Expand Down
5 changes: 2 additions & 3 deletions html5lib/tests/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
del base_path

# Build a dict of available trees
treeTypes = {"simpletree": treebuilders.getTreeBuilder("simpletree"),
"DOM": treebuilders.getTreeBuilder("dom")}
treeTypes = {"DOM": treebuilders.getTreeBuilder("dom")}

# Try whatever etree implementations are available from a list that are
#"supposed" to work
Expand Down Expand Up @@ -64,7 +63,7 @@ def __getitem__(self, key):

class TestData(object):
def __init__(self, filename, newTestHeading="data", encoding="utf8"):
if encoding == None:
if encoding is None:
self.f = open(filename, mode="rb")
else:
self.f = codecs.open(filename, encoding=encoding)
Expand Down
29 changes: 21 additions & 8 deletions html5lib/tests/test_parser2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from . import support # flake8: noqa
from html5lib import html5parser
from html5lib.constants import namespaces
from html5lib.treebuilders import dom
from html5lib import treebuilders

import unittest

Expand All @@ -14,29 +14,42 @@

class MoreParserTests(unittest.TestCase):

def setUp(self):
self.dom_tree = treebuilders.getTreeBuilder("dom")

def test_assertDoctypeCloneable(self):
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
parser = html5parser.HTMLParser(tree=self.dom_tree)
doc = parser.parse('<!DOCTYPE HTML>')
self.assertTrue(doc.cloneNode(True))

def test_line_counter(self):
# http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
parser = html5parser.HTMLParser(tree=self.dom_tree)
parser.parse("<pre>\nx\n&gt;\n</pre>")

def test_namespace_html_elements_0(self):
def test_namespace_html_elements_0_dom(self):
parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=True)
doc = parser.parse("<html></html>")
self.assertTrue(doc.childNodes[0].namespaceURI == namespaces["html"])

def test_namespace_html_elements_1_dom(self):
parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=False)
doc = parser.parse("<html></html>")
self.assertTrue(doc.childNodes[0].namespaceURI is None)

def test_namespace_html_elements_0_etree(self):
parser = html5parser.HTMLParser(namespaceHTMLElements=True)
doc = parser.parse("<html></html>")
self.assertTrue(doc.childNodes[0].namespace == namespaces["html"])
self.assertTrue(list(doc)[0].tag == "{%s}html" % (namespaces["html"],))

def test_namespace_html_elements_1(self):
def test_namespace_html_elements_1_etree(self):
parser = html5parser.HTMLParser(namespaceHTMLElements=False)
doc = parser.parse("<html></html>")
self.assertTrue(doc.childNodes[0].namespace == None)
self.assertTrue(list(doc)[0].tag == "html")

def test_unicode_file(self):
parser = html5parser.HTMLParser()
doc = parser.parse(io.StringIO("a"))
parser.parse(io.StringIO("a"))


def buildTestSuite():
Expand Down
59 changes: 42 additions & 17 deletions html5lib/tests/test_sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,43 @@
except ImportError:
import simplejson as json

from html5lib import html5parser, sanitizer, constants
from html5lib import html5parser, sanitizer, constants, treebuilders


def runSanitizerTest(name, expected, input):
expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
parseFragment(expected).childNodes])
def toxmlFactory():
tree = treebuilders.getTreeBuilder("etree")

def toxml(element):
# encode/decode roundtrip required for Python 2.6 compatibility
result_bytes = tree.implementation.tostring(element, encoding="utf-8")
return result_bytes.decode("utf-8")

return toxml


def runSanitizerTest(name, expected, input, toxml=None):
if toxml is None:
toxml = toxmlFactory()
expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
parseFragment(expected)])
expected = json.loads(json.dumps(expected))
assert expected == sanitize_html(input)


def sanitize_html(stream):
return ''.join([token.toxml() for token in
def sanitize_html(stream, toxml=None):
if toxml is None:
toxml = toxmlFactory()
return ''.join([toxml(token) for token in
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
parseFragment(stream).childNodes])
parseFragment(stream)])


def test_should_handle_astral_plane_characters():
assert "<p>\U0001d4b5 \U0001d538</p>" == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
assert '<html:p xmlns:html="http:https://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")


def test_sanitizer():
toxml = toxmlFactory()
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
continue # TODO
Expand All @@ -34,25 +50,30 @@ def test_sanitizer():
if tag_name == 'image':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
toxml)
elif tag_name == 'br':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
toxml)
elif tag_name in constants.voidElements:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
toxml)
else:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
toxml)

for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
tag_name = tag_name.upper()
yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
"&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
toxml)

for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
if attribute_name != attribute_name.lower():
Expand All @@ -61,20 +82,24 @@ def test_sanitizer():
continue
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name)
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
toxml)

for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
attribute_name = attribute_name.upper()
yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
"<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name)
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
toxml)

for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
"<a href=\"%s\">foo</a>" % protocol,
"""<a href="%s">foo</a>""" % protocol)
"""<a href="%s">foo</a>""" % protocol,
toxml)

for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"<a href=\"%s\">foo</a>" % protocol,
"""<a href="%s">foo</a>""" % protocol)
"""<a href="%s">foo</a>""" % protocol,
toxml)
16 changes: 7 additions & 9 deletions html5lib/tests/test_treewalkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,11 @@ def PullDOMAdapter(node):
raise NotImplementedError("Node type not supported: " + str(node.nodeType))

treeTypes = {
"simpletree": {"builder": treebuilders.getTreeBuilder("simpletree"),
"walker": treewalkers.getTreeWalker("simpletree")},
"DOM": {"builder": treebuilders.getTreeBuilder("dom"),
"walker": treewalkers.getTreeWalker("dom")},
"PullDOM": {"builder": treebuilders.getTreeBuilder("dom"),
"adapter": PullDOMAdapter,
"walker": treewalkers.getTreeWalker("pulldom")},
"DOM": {"builder": treebuilders.getTreeBuilder("dom"),
"walker": treewalkers.getTreeWalker("dom")},
"PullDOM": {"builder": treebuilders.getTreeBuilder("dom"),
"adapter": PullDOMAdapter,
"walker": treewalkers.getTreeWalker("pulldom")},
}

# Try whatever etree implementations are available from a list that are
Expand Down Expand Up @@ -103,7 +101,7 @@ def PullDOMAdapter(node):
else:
def GenshiAdapter(tree):
text = None
for token in treewalkers.getTreeWalker("simpletree")(tree):
for token in treewalkers.getTreeWalker("dom")(tree):
type = token["type"]
if type in ("Characters", "SpaceCharacters"):
if text is None:
Expand Down Expand Up @@ -147,7 +145,7 @@ def GenshiAdapter(tree):
yield TEXT, text, (None, -1, -1)

treeTypes["genshi"] = \
{"builder": treebuilders.getTreeBuilder("simpletree"),
{"builder": treebuilders.getTreeBuilder("dom"),
"adapter": GenshiAdapter,
"walker": treewalkers.getTreeWalker("genshi")}

Expand Down
13 changes: 2 additions & 11 deletions html5lib/treebuilders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
signature for their constructor, see treebuilders.etree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
Expand All @@ -24,10 +24,6 @@
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""

from __future__ import absolute_import, division, unicode_literals
Expand All @@ -39,10 +35,8 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are "simpletree", "dom", and "etree"
values are:
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation for the sake of
backwards compatibility (as releases up until 0.10 had a
Expand All @@ -65,9 +59,6 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
implementation = minidom
# XXX: NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "simpletree":
from . import simpletree
treeBuilderCache[treeType] = simpletree.TreeBuilder
elif treeType == "lxml":
from . import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
Expand Down
1 change: 1 addition & 0 deletions html5lib/treebuilders/dom.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def insertText(self, data, parent=None):
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))

implementation = DomImplementation
name = None

def testSerializer(element):
Expand Down
1 change: 1 addition & 0 deletions html5lib/treebuilders/etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ class TreeBuilder(_base.TreeBuilder):
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
implementation = ElementTreeImplementation

def testSerializer(self, element):
return testSerializer(element)
Expand Down
7 changes: 4 additions & 3 deletions html5lib/treebuilders/etree_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def serializeElement(element, indent=0):
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
indent += 2
for child in element.getchildren():
for child in element:
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
Expand Down Expand Up @@ -163,7 +163,7 @@ def serializeElement(element):
if element.text:
rv.append(element.text)

for child in element.getchildren():
for child in element:
serializeElement(child)

rv.append("</%s>" % (element.tag,))
Expand All @@ -185,6 +185,7 @@ class TreeBuilder(_base.TreeBuilder):
elementClass = None
commentClass = None
fragmentClass = Document
implementation = etree

def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
Expand Down Expand Up @@ -280,7 +281,7 @@ def getFragment(self):
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
fragment.extend(list(element))
if element.tail:
fragment.append(element.tail)
return fragment
Expand Down
Loading

0 comments on commit 96da7f5

Please sign in to comment.