Fix all the files outside of html5lib to flake8 cleanly
gsnedders committed May 20, 2016
1 parent 2c3b64b commit 8238648
Showing 5 changed files with 80 additions and 58 deletions.
2 changes: 1 addition & 1 deletion flake8-run.sh
@@ -5,5 +5,5 @@ if [[ ! -x $(which flake8) ]]; then
exit 1
fi

flake8 html5lib
flake8 `dirname $0`
exit $?
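
The script change above widens the lint run from the html5lib package alone to everything under the repository root (dirname $0), which is why the utility scripts in the rest of this commit needed fixing. For context only, and not part of the commit, a minimal Python sketch of the kinds of issues flake8 reports, with the pycodestyle codes that the fixes below correspond to:

# Illustrative flake8 complaints matching the fixes in this commit:
import os, sys            # E401: multiple imports on one line;
                          # fixed below by one import statement per module
opts,args = (1, 2)        # E231: missing whitespace after ','
total=opts+args           # E225: missing whitespace around operator


def parse():              # E302: a top-level def wants two preceding blank
    return total          # lines, hence the blank lines added below.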
31 changes: 19 additions & 12 deletions parse.py
@@ -5,7 +5,6 @@
"""

import sys
import os
import traceback
from optparse import OptionParser

@@ -15,17 +14,21 @@
from html5lib import constants
from html5lib import utils


def parse():
optParser = getOptParser()
opts,args = optParser.parse_args()
opts, args = optParser.parse_args()
encoding = "utf8"

try:
f = args[-1]
# Try opening from the internet
if f.startswith('http://'):
try:
import urllib.request, urllib.parse, urllib.error, cgi
import urllib.request
import urllib.parse
import urllib.error
import cgi
f = urllib.request.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
@@ -41,7 +44,7 @@ def parse():
try:
# Try opening from file system
f = open(f, "rb")
except IOError as e:
except IOError as e:
sys.stderr.write("Unable to open file: %s\n" % e)
sys.exit(1)
except IndexError:
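
The import change above only splits the combined urllib/cgi import into one import per line (flake8 E401); the surrounding logic is unchanged: parse.py treats its last argument as a URL when it starts with http://, and otherwise opens it as a local file. A condensed, self-contained sketch of that fallback, paraphrasing the code above rather than adding behaviour:

import sys
import urllib.request


def open_input(arg):
    # Fetch over the network when the argument looks like a URL...
    if arg.startswith('http://'):
        return urllib.request.urlopen(arg)
    # ...otherwise fall back to the local filesystem, as parse.py does.
    try:
        return open(arg, "rb")
    except IOError as e:
        sys.stderr.write("Unable to open file: %s\n" % e)
        sys.exit(1)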
@@ -82,14 +85,15 @@ def parse():
if document:
printOutput(p, document, opts)
t2 = time.time()
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
else:
sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
else:
document = run(parseMethod, f, encoding, opts.scripting)
if document:
printOutput(p, document, opts)


def run(parseMethod, f, encoding, scripting):
try:
document = parseMethod(f, encoding=encoding, scripting=scripting)
@@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting):
traceback.print_exc()
return document


def printOutput(parser, document, opts):
if opts.encoding:
print("Encoding:", parser.tokenizer.stream.charEncoding)
@@ -116,7 +121,7 @@ def printOutput(parser, document, opts):
elif tb == "etree":
sys.stdout.write(utils.default_etree.tostring(document))
elif opts.tree:
if not hasattr(document,'__getitem__'):
if not hasattr(document, '__getitem__'):
document = [document]
for fragment in document:
print(parser.tree.testSerializer(fragment))
@@ -126,7 +131,7 @@ def printOutput(parser, document, opts):
kwargs = {}
for opt in serializer.HTMLSerializer.options:
try:
kwargs[opt] = getattr(opts,opt)
kwargs[opt] = getattr(opts, opt)
except:
pass
if not kwargs['quote_char']:
@@ -142,12 +147,14 @@ def printOutput(parser, document, opts):
encoding = "utf-8"
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
sys.stdout.write(text)
if not text.endswith('\n'): sys.stdout.write('\n')
if not text.endswith('\n'):
sys.stdout.write('\n')
if opts.error:
errList=[]
errList = []
for pos, errorcode, datavars in parser.errors:
errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")


def getOptParser():
parser = OptionParser(usage=__doc__)
12 changes: 6 additions & 6 deletions setup.py
@@ -5,7 +5,7 @@
from setuptools import setup


classifiers=[
classifiers = [
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
@@ -20,9 +20,9 @@
'Programming Language :: Python :: 3.5',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: Text Processing :: Markup :: HTML'
]
]

packages = ['html5lib'] + ['html5lib.'+name
packages = ['html5lib'] + ['html5lib.' + name
for name in os.listdir(os.path.join('html5lib'))
if os.path.isdir(os.path.join('html5lib', name)) and
not name.startswith('.') and name != 'tests']
@@ -39,9 +39,9 @@
assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)
for a in assignments:
if (len(a.targets) == 1 and
isinstance(a.targets[0], ast.Name) and
a.targets[0].id == "__version__" and
isinstance(a.value, ast.Str)):
isinstance(a.targets[0], ast.Name) and
a.targets[0].id == "__version__" and
isinstance(a.value, ast.Str)):
version = a.value.s

setup(name='html5lib',
50 changes: 31 additions & 19 deletions utils/entities.py
@@ -2,57 +2,67 @@

import html5lib


def parse(path="html5ents.xml"):
return html5lib.parse(open(path), treebuilder="lxml")


def entity_table(tree):
return dict((entity_name("".join(tr[0].xpath(".//text()"))),
entity_characters(tr[1].text))
for tr in tree.xpath("//h:tbody/h:tr",
namespaces={"h":"http://www.w3.org/1999/xhtml"}))
namespaces={"h": "http://www.w3.org/1999/xhtml"}))


def entity_name(inp):
return inp.strip()


def entity_characters(inp):
return "".join(codepoint_to_character(item)
for item in inp.split()
if item)
for item in inp.split()
if item)


def codepoint_to_character(inp):
return ("\U000"+inp[2:]).decode("unicode-escape")
return ("\\U000" + inp[2:]).decode("unicode-escape")
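
The doubled backslash here is more than a style fix: a bare "\U000…" literal is a truncated \UXXXXXXXX escape, which Python 3 rejects at parse time, and flake8 must parse every file it checks. With the backslash escaped, the string holds a literal backslash-U sequence that the unicode-escape codec decodes at run time. A small sketch of the idea, assuming Python 2 (this script relies on str.decode, which Python 3 strings do not have) and a hypothetical "U+XXXXX"-style input:

def codepoint_to_character(inp):
    # Drop the leading "U+", pad to the 8-digit \UXXXXXXXX form, and let
    # the unicode-escape codec turn the backslash sequence into a character.
    return ("\\U000" + inp[2:]).decode("unicode-escape")

print(codepoint_to_character("U+0003A"))  # prints ":" (U+003A, colon)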


def make_tests_json(entities):
test_list = make_test_list(entities)
tests_json = {"tests":
[make_test(*item) for item in test_list]
[make_test(*item) for item in test_list]
}
return tests_json


def make_test(name, characters, good):
return {
"description":test_description(name, good),
"input":"&%s"%name,
"output":test_expected(name, characters, good)
}
"description": test_description(name, good),
"input": "&%s" % name,
"output": test_expected(name, characters, good)
}


def test_description(name, good):
with_semicolon = name.endswith(";")
semicolon_text = {True:"with a semi-colon",
False:"without a semi-colon"}[with_semicolon]
semicolon_text = {True: "with a semi-colon",
False: "without a semi-colon"}[with_semicolon]
if good:
text = "Named entity: %s %s"%(name, semicolon_text)
text = "Named entity: %s %s" % (name, semicolon_text)
else:
text = "Bad named entity: %s %s"%(name, semicolon_text)
text = "Bad named entity: %s %s" % (name, semicolon_text)
return text


def test_expected(name, characters, good):
rv = []
if not good or not name.endswith(";"):
rv.append("ParseError")
rv.append(["Character", characters])
return rv


def make_test_list(entities):
tests = []
for entity_name, characters in entities.items():
@@ -61,20 +71,23 @@ def make_test_list(entities):
tests.append((entity_name, characters, True))
return sorted(tests)


def subentity_exists(entity_name, entities):
for i in range(1, len(entity_name)):
if entity_name[:-i] in entities:
return True
return False


def make_entities_code(entities):
entities_text = "\n".join(" \"%s\": u\"%s\","%(
name, entities[name].encode(
"unicode-escape").replace("\"", "\\\""))
for name in sorted(entities.keys()))
entities_text = "\n".join(" \"%s\": u\"%s\"," % (
name, entities[name].encode(
"unicode-escape").replace("\"", "\\\""))
for name in sorted(entities.keys()))
return """entities = {
%s
}"""%entities_text
}""" % entities_text


def main():
entities = entity_table(parse())
@@ -85,4 +98,3 @@ def main():

if __name__ == "__main__":
main()

43 changes: 23 additions & 20 deletions utils/spider.py
@@ -7,7 +7,9 @@
s.spider("http://www.google.com", maxURLs=100)
"""

import urllib.request, urllib.error, urllib.parse
import urllib.request
import urllib.error
import urllib.parse
import urllib.robotparser
import md5
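
An aside, untouched by this commit: the urllib imports above are Python 3 style, yet the md5 import is a Python 2-only module (removed in Python 3 in favour of hashlib). A Python 3 equivalent, for reference only, would be:

import hashlib

# hashlib.md5 replaces the removed top-level md5 module on Python 3.
digest = hashlib.md5(b"page content").hexdigest()
print(digest)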

@@ -16,11 +18,13 @@
import html5lib
from html5lib.treebuilders import etree


class Spider(object):

def __init__(self):
self.unvisitedURLs = set()
self.visitedURLs = set()
self.buggyURLs=set()
self.buggyURLs = set()
self.robotParser = urllib.robotparser.RobotFileParser()
self.contentDigest = {}
self.http = httplib2.Http(".cache")
@@ -70,18 +74,18 @@ def updateURLs(self, tree):
update the list of visited and unvisited URLs according to whether we
have seen them before or not"""
urls = set()
#Remove all links we have already visited
# Remove all links we have already visited
for link in tree.findall(".//a"):
try:
url = urllib.parse.urldefrag(link.attrib['href'])[0]
if (url and url not in self.unvisitedURLs and url
try:
url = urllib.parse.urldefrag(link.attrib['href'])[0]
if (url and url not in self.unvisitedURLs and url
not in self.visitedURLs):
urls.add(url)
except KeyError:
pass
urls.add(url)
except KeyError:
pass

#Remove all non-http URLs and add a suitable base URL where that is
#missing
# Remove all non-http URLs and add a suitable base URL where that is
# missing
newUrls = set()
for url in urls:
splitURL = list(urllib.parse.urlsplit(url))
@@ -93,23 +97,22 @@ def updateURLs(self, tree):
urls = newUrls

responseHeaders = {}
#Now we want to find the content types of the links we haven't visited
# Now we want to find the content types of the links we haven't visited
for url in urls:
try:
resp, content = self.http.request(url, "HEAD")
responseHeaders[url] = resp
except AttributeError as KeyError:
#Don't know why this happens
except AttributeError:
# Don't know why this happens
pass
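
The except-clause change above is a behaviour-relevant cleanup: "except AttributeError as KeyError" catches only AttributeError and binds the caught instance to the name KeyError, shadowing the builtin; it never caught KeyError at all. The new code simply drops the misleading binding. If catching both exception types had been the intent (an assumption, nothing in the commit says so), the idiomatic spelling is a tuple:

try:
    raise KeyError("no content-type header")
except (AttributeError, KeyError):
    # A tuple catches either exception type; "except A as B" only
    # catches A and renames the caught instance to B.
    pass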


#Remove links not of content-type html or pages not found
#XXX - need to deal with other status codes?
# Remove links not of content-type html or pages not found
# XXX - need to deal with other status codes?
toVisit = set([url for url in urls if url in responseHeaders and
"html" in responseHeaders[url]['content-type'] and
responseHeaders[url]['status'] == "200"])
"html" in responseHeaders[url]['content-type'] and
responseHeaders[url]['status'] == "200"])

#Now check we are allowed to spider the page
# Now check we are allowed to spider the page
for url in toVisit:
robotURL = list(urllib.parse.urlsplit(url)[:2])
robotURL.extend(["robots.txt", "", ""])
