get optionaltags tests working again

--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40655
ssameerr · May 31, 2007 · c98097f · c98097f
1 parent 7e8123a
commit c98097f
Show file tree

Hide file tree

Showing 2 changed files with 177 additions and 168 deletions.
diff --git a/src/serializer.py b/src/serializer.py
@@ -45,13 +45,182 @@ def htmlentityreplace_errors(exc):
 
  del register_error
 
-def _slide(iterator):
- previous = None
- for token in iterator:
- if previous is not None:
- yield previous, token
- previous = token
- yield previous, None
+class OptionalTagFilter:
+ def __init__(self, source):
+ self.source = source
+
+ def slider(self):
+ previous1 = previous2 = None
+ for token in self.source:
+ if previous1 is not None:
+ yield previous2, previous1, token
+ previous2 = previous1
+ previous1 = token
+ yield previous2, previous1, None
+
+ def __iter__(self):
+ for previous, token, next in self.slider():
+ type = token["type"]
+ if type == "StartTag":
+ if token["data"] or not self.is_optional_start(token["name"], previous, next):
+ yield token
+ elif type == "EndTag":
+ if not self.is_optional_end(token["name"], next):
+ yield token
+ else:
+ yield token
+
+ def is_optional_start(self, tagname, previous, next):
+ type = next and next["type"] or None
+ if tagname in 'html':
+ # An html element's start tag may be omitted if the first thing
+ # inside the html element is not a space character or a comment.
+ return type not in ("Comment", "SpaceCharacters")
+ elif tagname == 'head':
+ # A head element's start tag may be omitted if the first thing
+ # inside the head element is an element.
+ return type == "StartTag"
+ elif tagname == 'body':
+ # A body element's start tag may be omitted if the first thing
+ # inside the body element is not a space character or a comment,
+ # except if the first thing inside the body element is a script
+ # or style element and the node immediately preceding the body
+ # element is a head element whose end tag has been omitted.
+ if type in ("Comment", "SpaceCharacters"):
+ return False
+ elif type == "StartTag":
+ # XXX: we do not look at the preceding event, so we never omit
+ # the body element's start tag if it's followed by a script or
+ # a style element.
+ return next["name"] not in ('script', 'style')
+ else:
+ return True
+ elif tagname == 'colgroup':
+ # A colgroup element's start tag may be omitted if the first thing
+ # inside the colgroup element is a col element, and if the element
+ # is not immediately preceeded by another colgroup element whose
+ # end tag has been omitted.
+ if type == "StartTag":
+ # XXX: we do not look at the preceding event, so instead we never
+ # omit the colgroup element's end tag when it is immediately
+ # followed by another colgroup element. See is_optional_end.
+ return next["name"] == "col"
+ else:
+ return False
+ elif tagname == 'tbody':
+ # A tbody element's start tag may be omitted if the first thing
+ # inside the tbody element is a tr element, and if the element is
+ # not immediately preceeded by a tbody, thead, or tfoot element
+ # whose end tag has been omitted.
+ if type == "StartTag":
+ # omit the thead and tfoot elements' end tag when they are
+ # immediately followed by a tbody element. See is_optional_end.
+ if previous and previous['type'] == 'EndTag' and \
+ previous['name'] in ('tbody','thead','tfoot'):
+ return False
+ return next["name"] == 'tr'
+ else:
+ return False
+ return False
+
+ def is_optional_end(self, tagname, next):
+ type = next and next["type"] or None
+ if tagname in ('html', 'head', 'body'):
+ # An html element's end tag may be omitted if the html element
+ # is not immediately followed by a space character or a comment.
+ return type not in ("Comment", "SpaceCharacters")
+ elif tagname in ('li', 'optgroup', 'option', 'tr'):
+ # A li element's end tag may be omitted if the li element is
+ # immediately followed by another li element or if there is
+ # no more content in the parent element.
+ # An optgroup element's end tag may be omitted if the optgroup
+ # element is immediately followed by another optgroup element,
+ # or if there is no more content in the parent element.
+ # An option element's end tag may be omitted if the option
+ # element is immediately followed by another option element,
+ # or if there is no more content in the parent element.
+ # A tr element's end tag may be omitted if the tr element is
+ # immediately followed by another tr element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] == tagname
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('dt', 'dd'):
+ # A dt element's end tag may be omitted if the dt element is
+ # immediately followed by another dt element or a dd element.
+ # A dd element's end tag may be omitted if the dd element is
+ # immediately followed by another dd element or a dt element,
+ # or if there is no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('dt', 'dd')
+ elif tagname == 'dd':
+ return type == "EndTag" or type is None
+ else:
+ return False
+ elif tagname == 'p':
+ # A p element's end tag may be omitted if the p element is
+ # immediately followed by an address, blockquote, dl, fieldset,
+ # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
+ # or ul element, or if there is no more content in the parent
+ # element.
+ if type == "StartTag":
+ return next["name"] in ('address', 'blockquote', \
+ 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
+ 'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
+ else:
+ return type == "EndTag" or type is None
+ elif tagname == 'colgroup':
+ # A colgroup element's end tag may be omitted if the colgroup
+ # element is not immediately followed by a space character or
+ # a comment.
+ if type in ("Comment", "SpaceCharacters"):
+ return False
+ elif type == "StartTag":
+ # XXX: we also look for an immediately following colgroup
+ # element. See is_optional_start.
+ return next["name"] != 'colgroup'
+ else:
+ return True
+ elif tagname in ('thead', 'tbody'):
+ # A thead element's end tag may be omitted if the thead element
+ # is immediately followed by a tbody or tfoot element.
+ # A tbody element's end tag may be omitted if the tbody element
+ # is immediately followed by a tbody or tfoot element, or if
+ # there is no more content in the parent element.
+ # A tfoot element's end tag may be omitted if the tfoot element
+ # is immediately followed by a tbody element, or if there is no
+ # more content in the parent element.
+ # XXX: we never omit the end tag when the following element is
+ # a tbody. See is_optional_start.
+ if type == "StartTag":
+ return next["name"] in ['tbody', 'tfoot']
+ elif tagname == 'tbody':
+ return type == "EndTag" or type is None
+ else:
+ return False
+ elif tagname == 'tfoot':
+ # A tfoot element's end tag may be omitted if the tfoot element
+ # is immediately followed by a tbody element, or if there is no
+ # more content in the parent element.
+ # XXX: we never omit the end tag when the following element is
+ # a tbody. See is_optional_start.
+ if type == "StartTag":
+ return next["name"] == 'tbody'
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('td', 'th'):
+ # A td element's end tag may be omitted if the td element is
+ # immediately followed by a td or th element, or if there is
+ # no more content in the parent element.
+ # A th element's end tag may be omitted if the th element is
+ # immediately followed by a td or th element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('td', 'th')
+ else:
+ return type == "EndTag" or type is None
+ return False
 
 class HTMLSerializer(object):
  cdata_elements = frozenset(("style", "script", "xmp", "iframe", "noembed", "noframes", "noscript"))
@@ -89,7 +258,7 @@ def serialize(self, treewalker, encoding=None):
  if self.strip_whitespace:
  treewalker = self.filter_whitespace(treewalker)
  if self.omit_optional_tags:
- treewalker = self.filter_optional_tags(treewalker)
+ treewalker = OptionalTagFilter(treewalker)
  for token in treewalker:
  type = token["type"]
  if type == "Doctype":
@@ -218,165 +387,6 @@ def filter_inject_meta_charset(self, treewalker, encoding):
  def filter_whitespace(self, treewalker):
  raise NotImplementedError
 
- def filter_optional_tags(self, treewalker):
- for token, next in _slide(treewalker):
- type = token["type"]
- if type == "StartTag":
- if token["data"] or not self.is_optional_start(token["name"], next):
- yield token
- elif type == "EndTag":
- if not self.is_optional_end(token["name"], next):
- yield token
- else:
- yield token
-
- def is_optional_start(self, tagname, next):
- type = next and next["type"] or None
- if tagname in 'html':
- # An html element's start tag may be omitted if the first thing
- # inside the html element is not a space character or a comment.
- return type not in ("Comment", "SpaceCharacters")
- elif tagname == 'head':
- # A head element's start tag may be omitted if the first thing
- # inside the head element is an element.
- return type == "StartTag"
- elif tagname == 'body':
- # A body element's start tag may be omitted if the first thing
- # inside the body element is not a space character or a comment,
- # except if the first thing inside the body element is a script
- # or style element and the node immediately preceding the body
- # element is a head element whose end tag has been omitted.
- if type in ("Comment", "SpaceCharacters"):
- return False
- elif type == "StartTag":
- # XXX: we do not look at the preceding event, so we never omit
- # the body element's start tag if it's followed by a script or
- # a style element.
- return next["name"] not in ('script', 'style')
- else:
- return True
- elif tagname == 'colgroup':
- # A colgroup element's start tag may be omitted if the first thing
- # inside the colgroup element is a col element, and if the element
- # is not immediately preceeded by another colgroup element whose
- # end tag has been omitted.
- if type == "StartTag":
- # XXX: we do not look at the preceding event, so instead we never
- # omit the colgroup element's end tag when it is immediately
- # followed by another colgroup element. See is_optional_end.
- return next["name"] == "col"
- else:
- return False
- elif tagname == 'tbody':
- # A tbody element's start tag may be omitted if the first thing
- # inside the tbody element is a tr element, and if the element is
- # not immediately preceeded by a tbody, thead, or tfoot element
- # whose end tag has been omitted.
- if type == "StartTag":
- # XXX: we do not look at the preceding event, so instead we never
- # omit the thead and tfoot elements' end tag when they are
- # immediately followed by a tbody element. See is_optional_end.
- return next["name"] == 'tr'
- else:
- return False
- return False
-
- def is_optional_end(self, tagname, next):
- type = next and next["type"] or None
- if tagname in ('html', 'head', 'body'):
- # An html element's end tag may be omitted if the html element
- # is not immediately followed by a space character or a comment.
- return type not in ("Comment", "SpaceCharacters")
- elif tagname in ('li', 'optgroup', 'option', 'tr'):
- # A li element's end tag may be omitted if the li element is
- # immediately followed by another li element or if there is
- # no more content in the parent element.
- # An optgroup element's end tag may be omitted if the optgroup
- # element is immediately followed by another optgroup element,
- # or if there is no more content in the parent element.
- # An option element's end tag may be omitted if the option
- # element is immediately followed by another option element,
- # or if there is no more content in the parent element.
- # A tr element's end tag may be omitted if the tr element is
- # immediately followed by another tr element, or if there is
- # no more content in the parent element.
- if type == "StartTag":
- return next["name"] == tagname
- else:
- return type == "EndTag" or type is None
- elif tagname in ('dt', 'dd'):
- # A dt element's end tag may be omitted if the dt element is
- # immediately followed by another dt element or a dd element.
- # A dd element's end tag may be omitted if the dd element is
- # immediately followed by another dd element or a dt element,
- # or if there is no more content in the parent element.
- if type == "StartTag":
- return next["name"] in ('dt', 'dd')
- elif tagname == 'dd':
- return type == "EndTag" or type is None
- else:
- return False
- elif tagname == 'p':
- # A p element's end tag may be omitted if the p element is
- # immediately followed by an address, blockquote, dl, fieldset,
- # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
- # or ul element, or if there is no more content in the parent
- # element.
- if type == "StartTag":
- return next["name"] in ('address', 'blockquote', \
- 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
- 'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
- else:
- return type == "EndTag" or type is None
- elif tagname == 'colgroup':
- # A colgroup element's end tag may be omitted if the colgroup
- # element is not immediately followed by a space character or
- # a comment.
- if type in ("Comment", "SpaceCharacters"):
- return False
- elif type == "StartTag":
- # XXX: we also look for an immediately following colgroup
- # element. See is_optional_start.
- return next["name"] != 'colgroup'
- else:
- return True
- elif tagname in ('thead', 'tbody'):
- # A thead element's end tag may be omitted if the thead element
- # is immediately followed by a tbody or tfoot element.
- # A tbody element's end tag may be omitted if the tbody element
- # is immediately followed by a tbody or tfoot element, or if
- # there is no more content in the parent element.
- # A tfoot element's end tag may be omitted if the tfoot element
- # is immediately followed by a tbody element, or if there is no
- # more content in the parent element.
- # XXX: we never omit the end tag when the following element is
- # a tbody. See is_optional_start.
- if type == "StartTag":
- return next["name"] == 'tfoot'
- elif tagname == 'tbody':
- return type == "EndTag" or type is None
- else:
- return False
- elif tagname == 'tfoot':
- # A tfoot element's end tag may be omitted if the tfoot element
- # is immediately followed by a tbody element, or if there is no
- # more content in the parent element.
- # XXX: we never omit the end tag when the following element is
- # a tbody. See is_optional_start.
- return type == "EndTag" or type is None
- elif tagname in ('td', 'th'):
- # A td element's end tag may be omitted if the td element is
- # immediately followed by a td or th element, or if there is
- # no more content in the parent element.
- # A th element's end tag may be omitted if the th element is
- # immediately followed by a td or th element, or if there is
- # no more content in the parent element.
- if type == "StartTag":
- return next["name"] in ('td', 'th')
- else:
- return type == "EndTag" or type is None
- return False
-
 def SerializeError(Exception):
  """Error in serialized tree"""
  pass
diff --git a/tests/test_serializer.py b/tests/test_serializer.py
@@ -77,7 +77,6 @@ def serialize_html(self, input, options):
 
 def test_serializer():
  for filename in glob.glob('serializer/*.test'):
- if filename.find('optionaltags')>=0: continue # TODO
  tests = simplejson.load(file(filename))
  for test in tests['tests']:
  yield test