Skip to content

Commit

Permalink
Sync options
Browse files Browse the repository at this point in the history
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40821
  • Loading branch information
rubys committed Jun 26, 2007
1 parent 77700c0 commit a69f4f5
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 49 deletions.
11 changes: 6 additions & 5 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,7 @@ def printOutput(parser, document, opts):
sys.stdout.write(document.hilite("utf-8"))
elif opts.html:
kwargs = {}
for opt in ['inject_meta_charset', 'strip_whitespace', 'sanitize',
'omit_optional_tags', 'quote_attr_values', 'quote_char',
'use_best_quote_char', 'minimize_boolean_attributes',
'use_trailing_solidus', 'escape_lt_in_attrs',
'escape_rcdata']:
for opt in serializer.HTMLSerializer.options:
kwargs[opt] = getattr(opts,opt)
if not kwargs['quote_char']: del kwargs['quote_char']
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
Expand Down Expand Up @@ -176,6 +172,11 @@ def getOptParser():
default=False, dest="use_trailing_solidus",
help="use trailing solidus")

parser.add_option("", "--space-before-trailing-solidus",
action="store_true", default=False,
dest="space_before_trailing_solidus",
help="add space before trailing solidus")

parser.add_option("", "--escape-lt-in-attrs", action="store_true",
default=False, dest="escape_lt_in_attrs",
help="escape less than signs in attribute values")
Expand Down
8 changes: 8 additions & 0 deletions src/html5lib/filters/sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin

class Filter(_base.Filter, HTMLSanitizerMixin):
    """Tree-walker filter that pushes every token through the sanitizer.

    Each token produced by the underlying filter chain is handed to
    sanitize_token(); tokens it rejects (returns a falsy value for) are
    dropped from the stream, everything else is yielded onward.
    """

    def __iter__(self):
        upstream = _base.Filter.__iter__(self)
        for raw_token in upstream:
            cleaned = self.sanitize_token(raw_token)
            if cleaned:
                yield cleaned
69 changes: 37 additions & 32 deletions src/html5lib/sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer

class HTMLSanitizer(HTMLTokenizer):
class HTMLSanitizerMixin:
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""

acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
Expand Down Expand Up @@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
# => <script> do_nasty_stuff() </script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
if token["name"] in self.allowed_elements:
if token.has_key("data"):
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if not attrs.has_key(attr): continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
del attrs[attr]
if attrs.has_key('style'):
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name,val] for name,val in attrs.items()]
yield token
else:
if token["type"] == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag":
token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
yield token
elif token["type"] == "Comment":
pass
def sanitize_token(self, token):
    """Sanitize a single tokenizer/tree-walker token.

    Returns the (possibly rewritten) token to emit, or None when the
    token should be dropped from the stream:

    * allowed elements keep only whitelisted attributes, URI-valued
      attributes with disallowed schemes are removed, and inline style
      is run through sanitize_css();
    * disallowed elements are re-emitted as escaped literal text
      (a "Characters" token);
    * comments are dropped (returns None);
    * all other token types pass through untouched.
    """
    if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
        if token["name"] in self.allowed_elements:
            # Allowed element: keep it, but filter its attributes.
            # NOTE: the original diff carried a stray `yield token` here,
            # which is a SyntaxError next to `return token`; this version
            # is a plain function that only returns.
            if "data" in token:
                # Build the dict from the reversed attribute list so that,
                # for duplicate attribute names, the first occurrence wins.
                attrs = dict([(name, val) for name, val in token["data"][::-1]
                              if name in self.allowed_attributes])
                for attr in self.attr_val_is_uri:
                    if attr not in attrs:
                        continue
                    # Strip control characters / whitespace that browsers
                    # ignore before checking the URI scheme.
                    val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+", '',
                                           unescape(attrs[attr])).lower()
                    if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                        val_unescaped.split(':')[0] not in self.allowed_protocols):
                        del attrs[attr]
                if 'style' in attrs:
                    attrs['style'] = self.sanitize_css(attrs['style'])
                token["data"] = [[name, val] for name, val in attrs.items()]
            return token
        else:
            # Disallowed element: turn the tag itself into escaped text.
            if token["type"] == "EndTag":
                token["data"] = "</%s>" % token["name"]
            elif token["data"]:
                attrs = ''.join([' %s="%s"' % (k, escape(v))
                                 for k, v in token["data"]])
                token["data"] = "<%s%s>" % (token["name"], attrs)
            else:
                token["data"] = "<%s>" % token["name"]
            if token["type"] == "EmptyTag":
                token["data"] = token["data"][:-1] + "/>"
            token["type"] = "Characters"
            del token["name"]
            return token
    elif token["type"] == "Comment":
        # Comments are dropped entirely (implicit return None).
        pass
    else:
        # Characters, SpaceCharacters, Doctype, ... pass through untouched.
        return token

def sanitize_css(self, style):
# disallow urls
Expand All @@ -187,3 +186,9 @@ def sanitize_css(self, style):
clean.append(prop + ': ' + value + ';')

return ' '.join(clean)

class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer variant that sanitizes tokens as they are produced.

    Each token from the underlying HTMLTokenizer is passed through
    sanitize_token(); falsy results are suppressed, everything else is
    yielded (possibly rewritten by the sanitizer).
    """

    def __iter__(self):
        token_stream = HTMLTokenizer.__iter__(self)
        for candidate in token_stream:
            result = self.sanitize_token(candidate)
            if result:
                yield result
25 changes: 13 additions & 12 deletions src/html5lib/serializer/htmlserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@
import gettext
_ = gettext.gettext

from html5lib.filters.whitespace import Filter as WhitespaceFilter
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter

from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements

Expand Down Expand Up @@ -67,17 +63,16 @@ class HTMLSerializer(object):
escape_lt_in_attrs = False
escape_rcdata = False

omit_optional_tags = True

strip_whitespace = False

inject_meta_charset = True
strip_whitespace = False
sanitize = False
omit_optional_tags = True

options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata")
"escape_rcdata", 'use_trailing_solidus', "sanitize")

def __init__(self, **kwargs):
if kwargs.has_key('quote_char'):
Expand All @@ -91,13 +86,19 @@ def serialize(self, treewalker, encoding=None):
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
treewalker = InjectMetaCharsetFilter(treewalker, encoding)
from html5lib.filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter
if self.strip_whitespace:
treewalker = WhitespaceFilter(treewalker)
from html5lib.filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from html5lib.filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
treewalker = OptionalTagFilter(treewalker)
from html5lib.filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
Expand Down

0 comments on commit a69f4f5

Please sign in to comment.