Add prospector and fix some bugs #218

Merged · 27 commits · May 20, 2016

Commits
c64bfca
Get rid of mutable default arguments
gsnedders Dec 3, 2015
c1c16ce
Avoid noisiness from pylint and the parser's set patterns
gsnedders Dec 4, 2015
2c3b64b
add pep8/flake8 config to get something useful happening with them
gsnedders May 20, 2016
8238648
Fix all the files outside of html5lib to flake8 cleanly
gsnedders May 20, 2016
de6bcf2
Fix incorrectly hidden flake8 errors
gsnedders May 20, 2016
0bd31c4
Get rid of type()-based type-check
gsnedders May 20, 2016
d440a83
Silence pytest unused-variable warnings
gsnedders May 20, 2016
5c1d8e2
Remove duplicate entry from constants.replacementCharacters
gsnedders May 20, 2016
1b86ccb
Remove gratuitious argument in sanitizer
gsnedders May 20, 2016
82d623b
Silence redefined-variable-type
gsnedders May 20, 2016
a017b88
Silence unused-argument
gsnedders May 20, 2016
e5d395c
Silence wrong-import-position
gsnedders May 20, 2016
b64df28
Change which way around we overwrite this for clarity's sake
gsnedders May 20, 2016
df0b2ba
Remove unused import
gsnedders May 20, 2016
742715d
Fix invalid_unicode_re on platforms supporting lone surrogates
gsnedders May 20, 2016
cd74ec7
Fix comment
gsnedders May 20, 2016
15e126f
Silence eval-used
gsnedders May 20, 2016
bfc278a
Silence bare-except
gsnedders May 20, 2016
b46fcdf
Silence too-many-nested-blocks
gsnedders May 20, 2016
6945bc4
Silence not-callable
gsnedders May 20, 2016
0c290e0
Kill long-dead finalText code
gsnedders May 20, 2016
da099dc
Silence a buggily output non-parent-init-called
gsnedders May 20, 2016
97427de
Fix indentation
gsnedders May 20, 2016
2afe09b
Make this in practice unreachable code work on Py2
gsnedders May 20, 2016
c0df867
Silence arguments-differ
gsnedders May 20, 2016
5dce4f2
Silence protected-access
gsnedders May 20, 2016
a2b8c11
Add prospector/pylint config for the sake of Landscape.
gsnedders Dec 4, 2015
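
A note on the first commit in the list ("Get rid of mutable default arguments", c64bfca): a Python default value is evaluated once, at function definition time, so a mutable default such as a list is shared across every call. A minimal sketch of the pitfall and the usual None-sentinel fix; the names are illustrative, not the PR's actual code:

def append_bad(item, items=[]):
    # The [] default is created once, when the def executes, so every
    # call that omits `items` mutates the same shared list.
    items.append(item)
    return items


def append_good(item, items=None):
    # Conventional fix: default to None and build a fresh list per call.
    if items is None:
        items = []
    items.append(item)
    return items


print(append_bad(1))   # [1]
print(append_bad(2))   # [1, 2]  state leaked between calls
print(append_good(1))  # [1]
print(append_good(2))  # [2]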

Changes from 1 commit:
Fix all the files outside of html5lib to flake8 cleanly
gsnedders committed May 20, 2016
commit 823864882ee969ebb7c16986a80388d5785cb9ea
2 changes: 1 addition & 1 deletion flake8-run.sh
@@ -5,5 +5,5 @@ if [[ ! -x $(which flake8) ]]; then
     exit 1
 fi
 
-flake8 html5lib
+flake8 `dirname $0`
 exit $?
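
Note that `dirname $0` expands to the directory containing flake8-run.sh, i.e. the repository root, so flake8 now checks parse.py, setup.py, and utils/ in addition to the html5lib package; that widened scope is what the rest of this commit cleans up.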
31 changes: 19 additions & 12 deletions parse.py
@@ -5,7 +5,6 @@
 """
 
 import sys
-import os
 import traceback
 from optparse import OptionParser
 
@@ -15,17 +14,21 @@
 from html5lib import constants
 from html5lib import utils
 
+
 def parse():
     optParser = getOptParser()
-    opts,args = optParser.parse_args()
+    opts, args = optParser.parse_args()
     encoding = "utf8"
 
     try:
         f = args[-1]
         # Try opening from the internet
         if f.startswith('http://'):
             try:
-                import urllib.request, urllib.parse, urllib.error, cgi
+                import urllib.request
+                import urllib.parse
+                import urllib.error
+                import cgi
                 f = urllib.request.urlopen(f)
                 contentType = f.headers.get('content-type')
                 if contentType:
@@ -41,7 +44,7 @@ def parse():
             try:
                 # Try opening from file system
                 f = open(f, "rb")
-            except IOError as e: 
+            except IOError as e:
                 sys.stderr.write("Unable to open file: %s\n" % e)
                 sys.exit(1)
     except IndexError:
@@ -82,14 +85,15 @@ def parse():
         if document:
             printOutput(p, document, opts)
             t2 = time.time()
-            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
+            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
         else:
-            sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
+            sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
     else:
         document = run(parseMethod, f, encoding, opts.scripting)
         if document:
             printOutput(p, document, opts)
 
+
 def run(parseMethod, f, encoding, scripting):
     try:
         document = parseMethod(f, encoding=encoding, scripting=scripting)
@@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting):
         traceback.print_exc()
     return document
 
+
 def printOutput(parser, document, opts):
     if opts.encoding:
         print("Encoding:", parser.tokenizer.stream.charEncoding)
@@ -116,7 +121,7 @@ def printOutput(parser, document, opts):
         elif tb == "etree":
             sys.stdout.write(utils.default_etree.tostring(document))
         elif opts.tree:
-            if not hasattr(document,'__getitem__'):
+            if not hasattr(document, '__getitem__'):
                 document = [document]
             for fragment in document:
                 print(parser.tree.testSerializer(fragment))
@@ -126,7 +131,7 @@ def printOutput(parser, document, opts):
         kwargs = {}
         for opt in serializer.HTMLSerializer.options:
             try:
-                kwargs[opt] = getattr(opts,opt)
+                kwargs[opt] = getattr(opts, opt)
             except:
                 pass
         if not kwargs['quote_char']:
@@ -142,12 +147,14 @@ def printOutput(parser, document, opts):
             encoding = "utf-8"
         for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
             sys.stdout.write(text)
-            if not text.endswith('\n'): sys.stdout.write('\n')
+            if not text.endswith('\n'):
+                sys.stdout.write('\n')
     if opts.error:
-        errList=[]
+        errList = []
         for pos, errorcode, datavars in parser.errors:
-            errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
-        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
+            errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
+        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
 
+
 def getOptParser():
     parser = OptionParser(usage=__doc__)
12 changes: 6 additions & 6 deletions setup.py
@@ -5,7 +5,7 @@
 from setuptools import setup
 
 
-classifiers=[
+classifiers = [
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
     'License :: OSI Approved :: MIT License',
@@ -20,9 +20,9 @@
     'Programming Language :: Python :: 3.5',
     'Topic :: Software Development :: Libraries :: Python Modules',
     'Topic :: Text Processing :: Markup :: HTML'
-    ]
+]
 
-packages = ['html5lib'] + ['html5lib.'+name
+packages = ['html5lib'] + ['html5lib.' + name
                            for name in os.listdir(os.path.join('html5lib'))
                            if os.path.isdir(os.path.join('html5lib', name)) and
                            not name.startswith('.') and name != 'tests']
@@ -39,9 +39,9 @@
 assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)
 for a in assignments:
     if (len(a.targets) == 1 and
-        isinstance(a.targets[0], ast.Name) and
-        a.targets[0].id == "__version__" and
-        isinstance(a.value, ast.Str)):
+            isinstance(a.targets[0], ast.Name) and
+            a.targets[0].id == "__version__" and
+            isinstance(a.value, ast.Str)):
         version = a.value.s
 
 setup(name='html5lib',
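
For context, the block above reads the package version without importing html5lib: it parses the module source into an AST and scans top-level assignments for a string bound to __version__. A self-contained sketch of the same technique (the path and the None fallback are illustrative; ast.Str matches the Python of this era, where newer code would check ast.Constant):

import ast


def read_version(path="html5lib/__init__.py"):
    # Parse the source without importing it, avoiding import side effects.
    with open(path) as f:
        tree = ast.parse(f.read())
    # Look for a top-level assignment of the form: __version__ = "..."
    for node in tree.body:
        if (isinstance(node, ast.Assign) and
                len(node.targets) == 1 and
                isinstance(node.targets[0], ast.Name) and
                node.targets[0].id == "__version__" and
                isinstance(node.value, ast.Str)):
            return node.value.s
    return None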
50 changes: 31 additions & 19 deletions utils/entities.py
@@ -2,57 +2,67 @@
 
 import html5lib
 
+
 def parse(path="html5ents.xml"):
     return html5lib.parse(open(path), treebuilder="lxml")
 
+
 def entity_table(tree):
     return dict((entity_name("".join(tr[0].xpath(".//text()"))),
                  entity_characters(tr[1].text))
                 for tr in tree.xpath("//h:tbody/h:tr",
-                                     namespaces={"h":"http://www.w3.org/1999/xhtml"}))
+                                     namespaces={"h": "http://www.w3.org/1999/xhtml"}))
 
+
 def entity_name(inp):
     return inp.strip()
 
+
 def entity_characters(inp):
     return "".join(codepoint_to_character(item)
-           for item in inp.split()
-           if item)
+                   for item in inp.split()
+                   if item)
 
+
 def codepoint_to_character(inp):
-    return ("\U000"+inp[2:]).decode("unicode-escape")
+    return ("\\U000" + inp[2:]).decode("unicode-escape")
 
+
 def make_tests_json(entities):
     test_list = make_test_list(entities)
     tests_json = {"tests":
-                 [make_test(*item) for item in test_list]
+                  [make_test(*item) for item in test_list]
                   }
     return tests_json
 
+
 def make_test(name, characters, good):
     return {
-        "description":test_description(name, good),
-        "input":"&%s"%name,
-        "output":test_expected(name, characters, good)
-        }
+        "description": test_description(name, good),
+        "input": "&%s" % name,
+        "output": test_expected(name, characters, good)
+    }
 
+
 def test_description(name, good):
     with_semicolon = name.endswith(";")
-    semicolon_text = {True:"with a semi-colon",
-                      False:"without a semi-colon"}[with_semicolon]
+    semicolon_text = {True: "with a semi-colon",
+                      False: "without a semi-colon"}[with_semicolon]
     if good:
-        text = "Named entity: %s %s"%(name, semicolon_text)
+        text = "Named entity: %s %s" % (name, semicolon_text)
     else:
-        text = "Bad named entity: %s %s"%(name, semicolon_text)
+        text = "Bad named entity: %s %s" % (name, semicolon_text)
     return text
 
+
 def test_expected(name, characters, good):
     rv = []
     if not good or not name.endswith(";"):
         rv.append("ParseError")
     rv.append(["Character", characters])
     return rv
 
+
 def make_test_list(entities):
     tests = []
     for entity_name, characters in entities.items():
@@ -61,20 +71,23 @@ def make_test_list(entities):
             tests.append((entity_name, characters, True))
     return sorted(tests)
 
+
 def subentity_exists(entity_name, entities):
     for i in range(1, len(entity_name)):
         if entity_name[:-i] in entities:
             return True
     return False
 
+
 def make_entities_code(entities):
-    entities_text = "\n".join("    \"%s\": u\"%s\","%(
-        name, entities[name].encode(
-            "unicode-escape").replace("\"", "\\\""))
-                              for name in sorted(entities.keys()))
+    entities_text = "\n".join("    \"%s\": u\"%s\"," % (
+        name, entities[name].encode(
+            "unicode-escape").replace("\"", "\\\""))
+        for name in sorted(entities.keys()))
     return """entities = {
 %s
-}"""%entities_text
+}""" % entities_text
 
+
 def main():
     entities = entity_table(parse())
@@ -85,4 +98,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
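
One behavioral subtlety hides in codepoint_to_character above: in a Python 2 str, \U is not a recognized escape, so the old "\U000" kept its backslash only by accident (and drew exactly the kind of anomalous-backslash warning the linters this PR adds complain about); the new "\\U000" makes the backslash explicit before the unicode-escape decode turns \UXXXXXXXX into a single character. A Python 2 sketch, assuming codepoints arrive as "U+XXXXX" with five hex digits, as the "\U000" prefix implies:

# Python 2: str.decode("unicode-escape") turns a literal \UXXXXXXXX
# sequence into a single unicode character.
def codepoint_to_character(inp):
    # "U+1F600" -> "\\U0001F600" -> u"\U0001F600"
    return ("\\U000" + inp[2:]).decode("unicode-escape")

print(repr(codepoint_to_character("U+1F600")))  # u'\U0001f600'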
43 changes: 23 additions & 20 deletions utils/spider.py
@@ -7,7 +7,9 @@
 s.spider("http://www.google.com", maxURLs=100)
 """
 
-import urllib.request, urllib.error, urllib.parse
+import urllib.request
+import urllib.error
+import urllib.parse
 import urllib.robotparser
 import md5
 
@@ -16,11 +18,13 @@
 import html5lib
 from html5lib.treebuilders import etree
 
+
 class Spider(object):
+
     def __init__(self):
         self.unvisitedURLs = set()
         self.visitedURLs = set()
-        self.buggyURLs=set()
+        self.buggyURLs = set()
         self.robotParser = urllib.robotparser.RobotFileParser()
         self.contentDigest = {}
         self.http = httplib2.Http(".cache")
@@ -70,18 +74,18 @@ def updateURLs(self, tree):
         update the list of visited and unvisited URLs according to whether we
         have seen them before or not"""
         urls = set()
-        #Remove all links we have already visited
+        # Remove all links we have already visited
        for link in tree.findall(".//a"):
-                try:
-                    url = urllib.parse.urldefrag(link.attrib['href'])[0]
-                    if (url and url not in self.unvisitedURLs and url
-                        not in self.visitedURLs):
-                        urls.add(url)
-                except KeyError:
-                    pass
+            try:
+                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                if (url and url not in self.unvisitedURLs and url
+                        not in self.visitedURLs):
+                    urls.add(url)
+            except KeyError:
+                pass
 
-        #Remove all non-http URLs and add a suitable base URL where that is
-        #missing
+        # Remove all non-http URLs and add a suitable base URL where that is
+        # missing
         newUrls = set()
         for url in urls:
             splitURL = list(urllib.parse.urlsplit(url))
@@ -93,23 +97,22 @@ def updateURLs(self, tree):
         urls = newUrls
 
         responseHeaders = {}
-        #Now we want to find the content types of the links we haven't visited
+        # Now we want to find the content types of the links we haven't visited
         for url in urls:
             try:
                 resp, content = self.http.request(url, "HEAD")
                 responseHeaders[url] = resp
-            except AttributeError as KeyError:
-                #Don't know why this happens
+            except AttributeError:
+                # Don't know why this happens
                 pass
 
-
-        #Remove links not of content-type html or pages not found
-        #XXX - need to deal with other status codes?
+        # Remove links not of content-type html or pages not found
+        # XXX - need to deal with other status codes?
         toVisit = set([url for url in urls if url in responseHeaders and
-                   "html" in responseHeaders[url]['content-type'] and
-                   responseHeaders[url]['status'] == "200"])
+                       "html" in responseHeaders[url]['content-type'] and
+                       responseHeaders[url]['status'] == "200"])
 
-        #Now check we are allowed to spider the page
+        # Now check we are allowed to spider the page
         for url in toVisit:
             robotURL = list(urllib.parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
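
The except change above is more than style: `except AttributeError as KeyError:` never caught KeyError; it caught only AttributeError and bound the exception instance to the name KeyError, shadowing the builtin inside the handler. A small illustration (hypothetical, not from the repo):

try:
    raise AttributeError("boom")
except AttributeError as KeyError:  # binds the instance to the name KeyError
    # The builtin KeyError is shadowed here; raising or catching a real
    # KeyError inside this block would now misbehave.
    print(type(KeyError).__name__)  # prints: AttributeError
# Catching both types needs a tuple instead:
#     except (AttributeError, KeyError):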