Skip to content

Commit

Permalink
[tests] refactors tests, adds tests for spider attr
Browse files Browse the repository at this point in the history
* refactors tests from functions to objects inheriting
from unittest.TestCase
* adds tests for enabling middleware with spider attribute
  • Loading branch information
pawelmhm committed May 26, 2015
1 parent 2e7407d commit bed8998
Showing 1 changed file with 150 additions and 125 deletions.
275 changes: 150 additions & 125 deletions tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,154 +2,179 @@
from __future__ import absolute_import
import copy
import json
from twisted.trial import unittest

import scrapy
from scrapy.core.engine import ExecutionEngine
from scrapy.utils.test import get_crawler
from scrapy.utils.httpobj import urlparse_cached

import scrapyjs
from scrapyjs.middleware import SplashMiddleware
from scrapyjs.request import SplashRequest


def _get_mw():
    """Build a SplashMiddleware wired to a minimal test crawler.

    Returns a middleware instance whose crawler has a real ExecutionEngine
    but a no-op spider-closed callback, suitable for unit tests.
    """
    crawler = get_crawler(settings_dict={
        'DOWNLOAD_HANDLERS': {'s3': None},  # for faster test running
    })
    # Older Scrapy versions do not attach a logformatter to the crawler;
    # the middleware expects the attribute to exist.
    if not hasattr(crawler, 'logformatter'):
        crawler.logformatter = None
    crawler.engine = ExecutionEngine(crawler, lambda _: None)
    return SplashMiddleware.from_crawler(crawler)


def test_nosplash():
    """Requests without splash meta must pass through with meta unchanged."""
    mw = _get_mw()
    req = scrapy.Request("http://example.com")
    old_meta = copy.deepcopy(req.meta)
    # None means "continue processing"; the request is left untouched.
    assert mw.process_request(req, None) is None
    assert old_meta == req.meta


def test_splash_request():
    """A SplashRequest is rewritten into a JSON POST to render.html."""
    mw = _get_mw()
    req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")

    req2 = mw.process_request(req, None)
    assert req2 is not None
    assert req2 is not req
    assert req2.url == "http://127.0.0.1:8050/render.html"
    assert req2.headers == {'Content-Type': ['application/json']}
    assert req2.method == 'POST'

    # The body carries the original URL plus the default splash args.
    expected_body = {'url': req.url}
    expected_body.update(SplashRequest.default_splash_meta['args'])
    assert json.loads(req2.body) == expected_body


def test_splash_request_no_url():
    """Without an explicit URL, Splash is pointed at about:blank."""
    mw = _get_mw()
    lua_source = "function main(splash) return {result='ok'} end"
    req1 = SplashRequest(meta={'splash': {
        'args': {'lua_source': lua_source},
        'endpoint': 'execute',
    }})
    req = mw.process_request(req1, None)
    assert req.url == 'http://127.0.0.1:8050/execute'
    assert json.loads(req.body) == {
        'url': 'about:blank',
        'lua_source': lua_source,
    }


def test_override_splash_url():
    """splash_url in meta overrides the default Splash server address."""
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.png',
            'splash_url': 'http://splash.example.com',
        }
    })
    req = mw.process_request(req1, None)
    assert req.url == 'http://splash.example.com/render.png'
    assert json.loads(req.body) == {'url': req1.url}


def test_float_wait_arg():
    """Float args such as `wait` survive JSON serialization untouched."""
    mw = _get_mw()
    req1 = scrapy.Request("http://example.com", meta={
        'splash': {
            'endpoint': 'render.html',
            'args': {'wait': 0.5},
        }
    })
    req = mw.process_request(req1, None)
    assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}

class MockedSlot(object):
    """Minimal stand-in for a downloader slot, carrying only a delay."""

    def __init__(self, delay=0.0):
        # Default mirrors an idle slot with no download delay configured.
        self.delay = delay

class MockedDownloader(object):
    """Fake downloader exposing only the slot bookkeeping the middleware uses.

    The interleaved deleted-test fragments in this diff region are superseded
    by MiddlewareTest.test_slot_policy_single_slot below.
    """

    def __init__(self):
        self.slots = {}

    def _get_slot_key(self, request, spider):
        # An explicit download_slot in meta wins over the per-host default.
        if 'download_slot' in request.meta:
            return request.meta['download_slot']

        # Fall back to the request hostname, mirroring Scrapy's downloader.
        key = urlparse_cached(request).hostname or ''
        return key

class MockedEngine(object):
    """Fake engine giving the middleware access to a mocked downloader."""
    downloader = MockedDownloader()


class MiddlewareTest(unittest.TestCase):
    """Unit tests for SplashMiddleware (the deleted per-domain function in
    this diff region is superseded by test_slot_policy_per_domain below)."""

    def setUp(self):
        self.crawler = get_crawler(settings_dict={
            'DOWNLOAD_HANDLERS': {'s3': None},  # for faster test running
        })
        # Older Scrapy versions do not set a logformatter on the crawler.
        if not hasattr(self.crawler, 'logformatter'):
            self.crawler.logformatter = None
        self.crawler.engine = MockedEngine()
        self.mw = SplashMiddleware.from_crawler(self.crawler)
def test_nosplash(self):
req = scrapy.Request("http:https://example.com")
old_meta = copy.deepcopy(req.meta)
assert self.mw.process_request(req, None) is None
assert old_meta == req.meta

def test_splash_request(self):
req = SplashRequest("http:https://example.com?foo=bar&url=1&wait=100")

def test_slot_policy_scrapy_default():
mw = _get_mw()
req = scrapy.Request("http:https://example.com", meta = {'splash': {
'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
}})
req = mw.process_request(req, None)
assert 'download_slot' not in req.meta
req2 = self.mw.process_request(req, None)
assert req2 is not None
assert req2 is not req
assert req2.url == "http:https://127.0.0.1:8050/render.html"
assert req2.headers == {'Content-Type': ['application/json']}
assert req2.method == 'POST'

expected_body = {'url': req.url}
expected_body.update(SplashRequest.default_splash_meta['args'])
assert json.loads(req2.body) == expected_body

def test_adjust_timeout():
mw = _get_mw()
req1 = scrapy.Request("http:https://example.com", meta = {
'splash': {'args': {'timeout': 60, 'html': 1}},

# download_timeout is always present,
# it is set by DownloadTimeoutMiddleware
'download_timeout': 30,
})
req1 = mw.process_request(req1, None)
assert req1.meta['download_timeout'] > 60
def test_splash_request_no_url(self):
lua_source = "function main(splash) return {result='ok'} end"
req1 = SplashRequest(meta={'splash': {
'args': {'lua_source': lua_source},
'endpoint': 'execute',
}})
req = self.mw.process_request(req1, None)
assert req.url == 'http:https://127.0.0.1:8050/execute'
assert json.loads(req.body) == {
'url': 'about:blank',
'lua_source': lua_source
}

req2 = scrapy.Request("http:https://example.com", meta = {
'splash': {'args': {'html': 1}},
'download_timeout': 30,
})
req2 = mw.process_request(req2, None)
assert req2.meta['download_timeout'] == 30
def test_override_splash_url(self):
req1 = scrapy.Request("http:https://example.com", meta={
'splash': {
'endpoint': 'render.png',
'splash_url': 'http:https://splash.example.com'
}
})
req = self.mw.process_request(req1, None)
assert req.url == 'http:https://splash.example.com/render.png'
assert json.loads(req.body) == {'url': req1.url}

def test_float_wait_arg(self):
req1 = scrapy.Request("http:https://example.com", meta={
'splash': {
'endpoint': 'render.html',
'args': {'wait': 0.5}
}
})
req = self.mw.process_request(req1, None)
assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}

def test_slot_policy_single_slot(self):
meta = {'splash': {
'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
}}

req1 = scrapy.Request("http:https://example.com/path?key=value", meta=meta)
req1 = self.mw.process_request(req1, None)

req2 = scrapy.Request("http:https://fooexample.com/path?key=value", meta=meta)
req2 = self.mw.process_request(req2, None)

assert req1.meta.get('download_slot')
assert req1.meta['download_slot'] == req2.meta['download_slot']

def test_slot_policy_per_domain(self):
meta = {'splash': {
'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
}}

req1 = scrapy.Request("http:https://example.com/path?key=value", meta=meta)
req1 = self.mw.process_request(req1, None)

req2 = scrapy.Request("http:https://example.com/path2", meta=meta)
req2 = self.mw.process_request(req2, None)

req3 = scrapy.Request("http:https://fooexample.com/path?key=value", meta=meta)
req3 = self.mw.process_request(req3, None)

assert req1.meta.get('download_slot')
assert req3.meta.get('download_slot')

assert req1.meta['download_slot'] == req2.meta['download_slot']
assert req1.meta['download_slot'] != req3.meta['download_slot']

def test_slot_policy_scrapy_default(self):
req = scrapy.Request("http:https://example.com", meta = {'splash': {
'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
}})
req = self.mw.process_request(req, None)
assert 'download_slot' not in req.meta

def test_adjust_timeout(self):
req1 = scrapy.Request("http:https://example.com", meta = {
'splash': {'args': {'timeout': 60, 'html': 1}},

# download_timeout is always present,
# it is set by DownloadTimeoutMiddleware
'download_timeout': 30,
})
req1 = self.mw.process_request(req1, None)
assert req1.meta['download_timeout'] > 60

req2 = scrapy.Request("http:https://example.com", meta = {
'splash': {'args': {'html': 1}},
'download_timeout': 30,
})
req2 = self.mw.process_request(req2, None)
assert req2.meta['download_timeout'] == 30

def test_spider_attribute(self):
req_url = "http:https://scrapy.org"
req1 = scrapy.Request(req_url)

spider = self.crawler._create_spider("foo")
spider.splash = {"args": {"images": 0}}

req1 = self.mw.process_request(req1, spider)
self.assertIn("_splash_processed", req1.meta)
self.assertIn("render.json", req1.url)
self.assertIn("url", json.loads(req1.body))
self.assertEqual(json.loads(req1.body).get("url"), req_url)
self.assertIn("images", json.loads(req1.body))

# spider attribute blank middleware disabled
spider.splash = {}
req2 = self.mw.process_request(req1, spider)
self.assertIsNone(req2)

0 comments on commit bed8998

Please sign in to comment.