Add prospector and fix some bugs #218

Merged · 27 commits · May 20, 2016

Commits
c64bfca
Get rid of mutable default arguments
gsnedders Dec 3, 2015
c1c16ce
Avoid noisiness from pylint and the parser's set patterns
gsnedders Dec 4, 2015
2c3b64b
add pep8/flake8 config to get something useful happening with them
gsnedders May 20, 2016
8238648
Fix all the files outside of html5lib to flake8 cleanly
gsnedders May 20, 2016
de6bcf2
Fix incorrectly hidden flake8 errors
gsnedders May 20, 2016
0bd31c4
Get rid of type()-based type-check
gsnedders May 20, 2016
d440a83
Silence pytest unused-variable warnings
gsnedders May 20, 2016
5c1d8e2
Remove duplicate entry from constants.replacementCharacters
gsnedders May 20, 2016
1b86ccb
Remove gratuitious argument in sanitizer
gsnedders May 20, 2016
82d623b
Silence redefined-variable-type
gsnedders May 20, 2016
a017b88
Silence unused-argument
gsnedders May 20, 2016
e5d395c
Silence wrong-import-position
gsnedders May 20, 2016
b64df28
Change which way around we overwrite this for clarity's sake
gsnedders May 20, 2016
df0b2ba
Remove unused import
gsnedders May 20, 2016
742715d
Fix invalid_unicode_re on platforms supporting lone surrogates
gsnedders May 20, 2016
cd74ec7
Fix comment
gsnedders May 20, 2016
15e126f
Silence eval-used
gsnedders May 20, 2016
bfc278a
Silence bare-except
gsnedders May 20, 2016
b46fcdf
Silence too-many-nested-blocks
gsnedders May 20, 2016
6945bc4
Silence not-callable
gsnedders May 20, 2016
0c290e0
Kill long-dead finalText code
gsnedders May 20, 2016
da099dc
Silence a buggily output non-parent-init-called
gsnedders May 20, 2016
97427de
Fix indentation
gsnedders May 20, 2016
2afe09b
Make this in practice unreachable code work on Py2
gsnedders May 20, 2016
c0df867
Silence arguments-differ
gsnedders May 20, 2016
5dce4f2
Silence protected-access
gsnedders May 20, 2016
a2b8c11
Add prospector/pylint config for the sake of Landscape.
gsnedders Dec 4, 2015
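
A note on the first commit in the list ("Get rid of mutable default arguments", c64bfca): a Python default value is evaluated once, at function definition time, so a mutable default such as a list is shared across every call. A minimal sketch of the pitfall and the usual None-sentinel fix; the names are illustrative, not the PR's actual code:

def append_bad(item, items=[]):
    # The [] default is created once, when the def executes, so every
    # call that omits `items` mutates the same shared list.
    items.append(item)
    return items


def append_good(item, items=None):
    # Conventional fix: default to None and build a fresh list per call.
    if items is None:
        items = []
    items.append(item)
    return items


print(append_bad(1))   # [1]
print(append_bad(2))   # [1, 2]  state leaked between calls
print(append_good(1))  # [1]
print(append_good(2))  # [2]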

Changes from 1 commit:
Fix all the files outside of html5lib to flake8 cleanly
gsnedders committed May 20, 2016
commit 823864882ee969ebb7c16986a80388d5785cb9ea
2 changes: 1 addition & 1 deletion flake8-run.sh
@@ -5,5 +5,5 @@ if [[ ! -x $(which flake8) ]]; then
     exit 1
 fi
 
-flake8 html5lib
+flake8 `dirname $0`
 exit $?
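
Note that `dirname $0` expands to the directory containing flake8-run.sh, i.e. the repository root, so flake8 now checks parse.py, setup.py, and utils/ in addition to the html5lib package; that widened scope is what the rest of this commit cleans up.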
31 changes: 19 additions & 12 deletions parse.py
@@ -5,7 +5,6 @@
 """
 
 import sys
-import os
 import traceback
 from optparse import OptionParser
 
@@ -15,17 +14,21 @@
 from html5lib import constants
 from html5lib import utils
 
+
 def parse():
     optParser = getOptParser()
-    opts,args = optParser.parse_args()
+    opts, args = optParser.parse_args()
     encoding = "utf8"
 
     try:
         f = args[-1]
         # Try opening from the internet
         if f.startswith('http://'):
             try:
-                import urllib.request, urllib.parse, urllib.error, cgi
+                import urllib.request
+                import urllib.parse
+                import urllib.error
+                import cgi
                 f = urllib.request.urlopen(f)
                 contentType = f.headers.get('content-type')
                 if contentType:
@@ -41,7 +44,7 @@ def parse():
             try:
                 # Try opening from file system
                 f = open(f, "rb")
-            except IOError as e: 
+            except IOError as e:
                 sys.stderr.write("Unable to open file: %s\n" % e)
                 sys.exit(1)
     except IndexError:
@@ -82,14 +85,15 @@ def parse():
         if document:
             printOutput(p, document, opts)
             t2 = time.time()
-            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
+            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
         else:
-            sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
+            sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
     else:
         document = run(parseMethod, f, encoding, opts.scripting)
         if document:
             printOutput(p, document, opts)
 
+
 def run(parseMethod, f, encoding, scripting):
     try:
         document = parseMethod(f, encoding=encoding, scripting=scripting)
@@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting):
         traceback.print_exc()
     return document
 
+
 def printOutput(parser, document, opts):
     if opts.encoding:
         print("Encoding:", parser.tokenizer.stream.charEncoding)
@@ -116,7 +121,7 @@ def printOutput(parser, document, opts):
         elif tb == "etree":
             sys.stdout.write(utils.default_etree.tostring(document))
         elif opts.tree:
-            if not hasattr(document,'__getitem__'):
+            if not hasattr(document, '__getitem__'):
                 document = [document]
             for fragment in document:
                 print(parser.tree.testSerializer(fragment))
@@ -126,7 +131,7 @@ def printOutput(parser, document, opts):
         kwargs = {}
         for opt in serializer.HTMLSerializer.options:
             try:
-                kwargs[opt] = getattr(opts,opt)
+                kwargs[opt] = getattr(opts, opt)
             except:
                 pass
         if not kwargs['quote_char']:
@@ -142,12 +147,14 @@ def printOutput(parser, document, opts):
             encoding = "utf-8"
         for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
             sys.stdout.write(text)
-            if not text.endswith('\n'): sys.stdout.write('\n')
+            if not text.endswith('\n'):
+                sys.stdout.write('\n')
     if opts.error:
-        errList=[]
+        errList = []
         for pos, errorcode, datavars in parser.errors:
-            errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
-        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
+            errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
+        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
 
+
 def getOptParser():
     parser = OptionParser(usage=__doc__)
12 changes: 6 additions & 6 deletions setup.py
@@ -5,7 +5,7 @@
 from setuptools import setup
 
 
-classifiers=[
+classifiers = [
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
     'License :: OSI Approved :: MIT License',
@@ -20,9 +20,9 @@
     'Programming Language :: Python :: 3.5',
     'Topic :: Software Development :: Libraries :: Python Modules',
     'Topic :: Text Processing :: Markup :: HTML'
-    ]
+]
 
-packages = ['html5lib'] + ['html5lib.'+name
+packages = ['html5lib'] + ['html5lib.' + name
                            for name in os.listdir(os.path.join('html5lib'))
                            if os.path.isdir(os.path.join('html5lib', name)) and
                            not name.startswith('.') and name != 'tests']
@@ -39,9 +39,9 @@
 assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)
 for a in assignments:
     if (len(a.targets) == 1 and
-        isinstance(a.targets[0], ast.Name) and
-        a.targets[0].id == "__version__" and
-        isinstance(a.value, ast.Str)):
+            isinstance(a.targets[0], ast.Name) and
+            a.targets[0].id == "__version__" and
+            isinstance(a.value, ast.Str)):
         version = a.value.s
 
 setup(name='html5lib',
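
For context, the block above reads the package version without importing html5lib: it parses the module source into an AST and scans top-level assignments for a string bound to __version__. A self-contained sketch of the same technique (the path and the None fallback are illustrative; ast.Str matches the Python of this era, where newer code would check ast.Constant):

import ast


def read_version(path="html5lib/__init__.py"):
    # Parse the source without importing it, avoiding import side effects.
    with open(path) as f:
        tree = ast.parse(f.read())
    # Look for a top-level assignment of the form: __version__ = "..."
    for node in tree.body:
        if (isinstance(node, ast.Assign) and
                len(node.targets) == 1 and
                isinstance(node.targets[0], ast.Name) and
                node.targets[0].id == "__version__" and
                isinstance(node.value, ast.Str)):
            return node.value.s
    return None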
50 changes: 31 additions & 19 deletions utils/entities.py
@@ -2,57 +2,67 @@
 
 import html5lib
 
+
 def parse(path="html5ents.xml"):
     return html5lib.parse(open(path), treebuilder="lxml")
 
+
 def entity_table(tree):
     return dict((entity_name("".join(tr[0].xpath(".//text()"))),
                  entity_characters(tr[1].text))
                 for tr in tree.xpath("//h:tbody/h:tr",
-                                     namespaces={"h":"http://www.w3.org/1999/xhtml"}))
+                                     namespaces={"h": "http://www.w3.org/1999/xhtml"}))
 
+
 def entity_name(inp):
     return inp.strip()
 
+
 def entity_characters(inp):
     return "".join(codepoint_to_character(item)
-           for item in inp.split()
-           if item)
+                   for item in inp.split()
+                   if item)
 
+
 def codepoint_to_character(inp):
-    return ("\U000"+inp[2:]).decode("unicode-escape")
+    return ("\\U000" + inp[2:]).decode("unicode-escape")
 
+
 def make_tests_json(entities):
     test_list = make_test_list(entities)
     tests_json = {"tests":
-                 [make_test(*item) for item in test_list]
+                  [make_test(*item) for item in test_list]
                   }
     return tests_json
 
+
 def make_test(name, characters, good):
     return {
-        "description":test_description(name, good),
-        "input":"&%s"%name,
-        "output":test_expected(name, characters, good)
-        }
+        "description": test_description(name, good),
+        "input": "&%s" % name,
+        "output": test_expected(name, characters, good)
+    }
 
+
 def test_description(name, good):
     with_semicolon = name.endswith(";")
-    semicolon_text = {True:"with a semi-colon",
-                      False:"without a semi-colon"}[with_semicolon]
+    semicolon_text = {True: "with a semi-colon",
+                      False: "without a semi-colon"}[with_semicolon]
     if good:
-        text = "Named entity: %s %s"%(name, semicolon_text)
+        text = "Named entity: %s %s" % (name, semicolon_text)
     else:
-        text = "Bad named entity: %s %s"%(name, semicolon_text)
+        text = "Bad named entity: %s %s" % (name, semicolon_text)
     return text
 
+
 def test_expected(name, characters, good):
     rv = []
     if not good or not name.endswith(";"):
         rv.append("ParseError")
     rv.append(["Character", characters])
     return rv
 
+
 def make_test_list(entities):
     tests = []
     for entity_name, characters in entities.items():
@@ -61,20 +71,23 @@ def make_test_list(entities):
             tests.append((entity_name, characters, True))
     return sorted(tests)
 
+
 def subentity_exists(entity_name, entities):
     for i in range(1, len(entity_name)):
         if entity_name[:-i] in entities:
             return True
     return False
 
+
 def make_entities_code(entities):
-    entities_text = "\n".join("    \"%s\": u\"%s\","%(
-        name, entities[name].encode(
-            "unicode-escape").replace("\"", "\\\""))
-                              for name in sorted(entities.keys()))
+    entities_text = "\n".join("    \"%s\": u\"%s\"," % (
+        name, entities[name].encode(
+            "unicode-escape").replace("\"", "\\\""))
+        for name in sorted(entities.keys()))
     return """entities = {
 %s
-}"""%entities_text
+}""" % entities_text
 
+
 def main():
     entities = entity_table(parse())
@@ -85,4 +98,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
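
One behavioral subtlety hides in codepoint_to_character above: in a Python 2 str, \U is not a recognized escape, so the old "\U000" kept its backslash only by accident (and drew exactly the kind of anomalous-backslash warning the linters this PR adds complain about); the new "\\U000" makes the backslash explicit before the unicode-escape decode turns \UXXXXXXXX into a single character. A Python 2 sketch, assuming codepoints arrive as "U+XXXXX" with five hex digits, as the "\U000" prefix implies:

# Python 2: str.decode("unicode-escape") turns a literal \UXXXXXXXX
# sequence into a single unicode character.
def codepoint_to_character(inp):
    # "U+1F600" -> "\\U0001F600" -> u"\U0001F600"
    return ("\\U000" + inp[2:]).decode("unicode-escape")

print(repr(codepoint_to_character("U+1F600")))  # u'\U0001f600'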
43 changes: 23 additions & 20 deletions utils/spider.py
@@ -7,7 +7,9 @@
 s.spider("http://www.google.com", maxURLs=100)
 """
 
-import urllib.request, urllib.error, urllib.parse
+import urllib.request
+import urllib.error
+import urllib.parse
 import urllib.robotparser
 import md5
 
@@ -16,11 +18,13 @@
 import html5lib
 from html5lib.treebuilders import etree
 
+
 class Spider(object):
+
     def __init__(self):
         self.unvisitedURLs = set()
         self.visitedURLs = set()
-        self.buggyURLs=set()
+        self.buggyURLs = set()
         self.robotParser = urllib.robotparser.RobotFileParser()
         self.contentDigest = {}
         self.http = httplib2.Http(".cache")
@@ -70,18 +74,18 @@ def updateURLs(self, tree):
         update the list of visited and unvisited URLs according to whether we
         have seen them before or not"""
         urls = set()
-        #Remove all links we have already visited
+        # Remove all links we have already visited
        for link in tree.findall(".//a"):
-                try:
-                    url = urllib.parse.urldefrag(link.attrib['href'])[0]
-                    if (url and url not in self.unvisitedURLs and url
-                        not in self.visitedURLs):
-                        urls.add(url)
-                except KeyError:
-                    pass
+            try:
+                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                if (url and url not in self.unvisitedURLs and url
+                        not in self.visitedURLs):
+                    urls.add(url)
+            except KeyError:
+                pass
 
-        #Remove all non-http URLs and add a suitable base URL where that is
-        #missing
+        # Remove all non-http URLs and add a suitable base URL where that is
+        # missing
         newUrls = set()
         for url in urls:
             splitURL = list(urllib.parse.urlsplit(url))
@@ -93,23 +97,22 @@ def updateURLs(self, tree):
         urls = newUrls
 
         responseHeaders = {}
-        #Now we want to find the content types of the links we haven't visited
+        # Now we want to find the content types of the links we haven't visited
         for url in urls:
             try:
                 resp, content = self.http.request(url, "HEAD")
                 responseHeaders[url] = resp
-            except AttributeError as KeyError:
-                #Don't know why this happens
+            except AttributeError:
+                # Don't know why this happens
                 pass
 
-
-        #Remove links not of content-type html or pages not found
-        #XXX - need to deal with other status codes?
+        # Remove links not of content-type html or pages not found
+        # XXX - need to deal with other status codes?
         toVisit = set([url for url in urls if url in responseHeaders and
-                   "html" in responseHeaders[url]['content-type'] and
-                   responseHeaders[url]['status'] == "200"])
+                       "html" in responseHeaders[url]['content-type'] and
+                       responseHeaders[url]['status'] == "200"])
 
-        #Now check we are allowed to spider the page
+        # Now check we are allowed to spider the page
         for url in toVisit:
             robotURL = list(urllib.parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
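
The except change above is more than style: `except AttributeError as KeyError:` never caught KeyError; it caught only AttributeError and bound the exception instance to the name KeyError, shadowing the builtin inside the handler. A small illustration (hypothetical, not from the repo):

try:
    raise AttributeError("boom")
except AttributeError as KeyError:  # binds the instance to the name KeyError
    # The builtin KeyError is shadowed here; raising or catching a real
    # KeyError inside this block would now misbehave.
    print(type(KeyError).__name__)  # prints: AttributeError
# Catching both types needs a tuple instead:
#     except (AttributeError, KeyError):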