Skip to content

Commit bed8998

Browse files
committed
[tests] refactors tests, adds tests for spider attr
* Refactors tests from module-level functions into classes inheriting from unittest.TestCase.
* Adds tests for enabling the middleware via a spider attribute.
1 parent 2e7407d commit bed8998

File tree

1 file changed

+150
-125
lines changed

1 file changed

+150
-125
lines changed

tests/test_middleware.py

Lines changed: 150 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -2,154 +2,179 @@
22
from __future__ import absolute_import
33
import copy
44
import json
5+
from twisted.trial import unittest
56

67
import scrapy
7-
from scrapy.core.engine import ExecutionEngine
88
from scrapy.utils.test import get_crawler
9+
from scrapy.utils.httpobj import urlparse_cached
910

1011
import scrapyjs
1112
from scrapyjs.middleware import SplashMiddleware
1213
from scrapyjs.request import SplashRequest
1314

1415

15-
def _get_mw():
16-
crawler = get_crawler(settings_dict={
17-
'DOWNLOAD_HANDLERS': {'s3': None}, # for faster test running
18-
})
19-
if not hasattr(crawler, 'logformatter'):
20-
crawler.logformatter = None
21-
crawler.engine = ExecutionEngine(crawler, lambda _: None)
22-
# spider = crawler._create_spider("foo")
23-
return SplashMiddleware.from_crawler(crawler)
24-
25-
26-
def test_nosplash():
27-
mw = _get_mw()
28-
req = scrapy.Request("http://example.com")
29-
old_meta = copy.deepcopy(req.meta)
30-
assert mw.process_request(req, None) is None
31-
assert old_meta == req.meta
32-
33-
34-
def test_splash_request():
35-
mw = _get_mw()
36-
req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
37-
38-
req2 = mw.process_request(req, None)
39-
assert req2 is not None
40-
assert req2 is not req
41-
assert req2.url == "http://127.0.0.1:8050/render.html"
42-
assert req2.headers == {'Content-Type': ['application/json']}
43-
assert req2.method == 'POST'
44-
45-
expected_body = {'url': req.url}
46-
expected_body.update(SplashRequest.default_splash_meta['args'])
47-
assert json.loads(req2.body) == expected_body
48-
49-
50-
def test_splash_request_no_url():
51-
mw = _get_mw()
52-
lua_source = "function main(splash) return {result='ok'} end"
53-
req1 = SplashRequest(meta={'splash': {
54-
'args': {'lua_source': lua_source},
55-
'endpoint': 'execute',
56-
}})
57-
req = mw.process_request(req1, None)
58-
assert req.url == 'http://127.0.0.1:8050/execute'
59-
assert json.loads(req.body) == {
60-
'url': 'about:blank',
61-
'lua_source': lua_source
62-
}
63-
64-
65-
def test_override_splash_url():
66-
mw = _get_mw()
67-
req1 = scrapy.Request("http://example.com", meta={
68-
'splash': {
69-
'endpoint': 'render.png',
70-
'splash_url': 'http://splash.example.com'
71-
}
72-
})
73-
req = mw.process_request(req1, None)
74-
assert req.url == 'http://splash.example.com/render.png'
75-
assert json.loads(req.body) == {'url': req1.url}
76-
77-
78-
def test_float_wait_arg():
79-
mw = _get_mw()
80-
req1 = scrapy.Request("http://example.com", meta={
81-
'splash': {
82-
'endpoint': 'render.html',
83-
'args': {'wait': 0.5}
84-
}
85-
})
86-
req = mw.process_request(req1, None)
87-
assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}
88-
16+
class MockedSlot(object):
    """Minimal stand-in for a scrapy downloader slot.

    The middleware under test only reads/writes the ``delay`` attribute,
    so nothing else is mocked.
    """

    def __init__(self, delay=0.0):
        # Download delay for this slot; defaults to no delay.
        self.delay = delay
9020

91-
def test_slot_policy_single_slot():
92-
mw = _get_mw()
93-
meta = {'splash': {
94-
'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
95-
}}
9621

97-
req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
98-
req1 = mw.process_request(req1, None)
22+
class MockedDownloader(object):
    """Minimal stand-in for scrapy's Downloader.

    Implements just enough of the slot API (``slots`` mapping and
    ``_get_slot_key``) for SplashMiddleware's slot-policy handling.
    """

    def __init__(self):
        # slot key -> slot object; starts empty.
        self.slots = {}

    def _get_slot_key(self, request, spider):
        """Return the download-slot key for *request*.

        Mirrors scrapy's behavior: an explicit ``download_slot`` in the
        request meta wins; otherwise fall back to the request's hostname
        (empty string when the URL has no hostname).
        """
        meta = request.meta
        if 'download_slot' in meta:
            return meta['download_slot']
        return urlparse_cached(request).hostname or ''
10633

107-
def test_slot_policy_per_domain():
108-
mw = _get_mw()
109-
meta = {'splash': {
110-
'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
111-
}}
11234

113-
req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
114-
req1 = mw.process_request(req1, None)
35+
class MockedEngine(object):
    """Minimal stand-in for scrapy's ExecutionEngine.

    The middleware only needs ``engine.downloader``.

    NOTE: the original bound a single ``MockedDownloader()`` as a class
    attribute, evaluated once at class-definition time, so every
    ``MockedEngine`` instance — and therefore every test created via
    ``setUp`` — shared one downloader and its mutable ``slots`` dict.
    Creating a fresh downloader per instance keeps tests isolated while
    leaving instance attribute access (``engine.downloader``) unchanged.
    """

    def __init__(self):
        self.downloader = MockedDownloader()
11537

116-
req2 = scrapy.Request("http://example.com/path2", meta=meta)
117-
req2 = mw.process_request(req2, None)
11838

119-
req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
120-
req3 = mw.process_request(req3, None)
39+
class MiddlewareTest(unittest.TestCase):
    """Tests for SplashMiddleware request processing."""

    def setUp(self):
        # Build a crawler with a mocked engine so no real downloading
        # (or reactor machinery) is involved.
        self.crawler = get_crawler(settings_dict={
            'DOWNLOAD_HANDLERS': {'s3': None},  # for faster test running
        })
        if not hasattr(self.crawler, 'logformatter'):
            self.crawler.logformatter = None
        self.crawler.engine = MockedEngine()
        self.mw = SplashMiddleware.from_crawler(self.crawler)

    def test_nosplash(self):
        # Requests without splash meta must pass through untouched.
        req = scrapy.Request("http://example.com")
        old_meta = copy.deepcopy(req.meta)
        assert self.mw.process_request(req, None) is None
        assert old_meta == req.meta

    def test_splash_request(self):
        req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")

        req2 = self.mw.process_request(req, None)
        assert req2 is not None
        assert req2 is not req
        # The request is rewritten into a JSON POST to the Splash endpoint.
        assert req2.url == "http://127.0.0.1:8050/render.html"
        assert req2.headers == {'Content-Type': ['application/json']}
        assert req2.method == 'POST'

        # Body carries the original URL plus the default splash args.
        expected_body = {'url': req.url}
        expected_body.update(SplashRequest.default_splash_meta['args'])
        assert json.loads(req2.body) == expected_body

    def test_splash_request_no_url(self):
        # Without a URL the middleware should substitute 'about:blank'.
        lua_source = "function main(splash) return {result='ok'} end"
        req1 = SplashRequest(meta={'splash': {
            'args': {'lua_source': lua_source},
            'endpoint': 'execute',
        }})
        req = self.mw.process_request(req1, None)
        assert req.url == 'http://127.0.0.1:8050/execute'
        assert json.loads(req.body) == {
            'url': 'about:blank',
            'lua_source': lua_source
        }

    def test_override_splash_url(self):
        # 'splash_url' in meta overrides the globally configured Splash URL.
        req1 = scrapy.Request("http://example.com", meta={
            'splash': {
                'endpoint': 'render.png',
                'splash_url': 'http://splash.example.com'
            }
        })
        req = self.mw.process_request(req1, None)
        assert req.url == 'http://splash.example.com/render.png'
        assert json.loads(req.body) == {'url': req1.url}

    def test_float_wait_arg(self):
        # Float args must survive JSON serialization unchanged.
        req1 = scrapy.Request("http://example.com", meta={
            'splash': {
                'endpoint': 'render.html',
                'args': {'wait': 0.5}
            }
        })
        req = self.mw.process_request(req1, None)
        assert json.loads(req.body) == {'url': req1.url, 'wait': 0.5}

    def test_slot_policy_single_slot(self):
        # SINGLE_SLOT: every request shares one download slot.
        meta = {'splash': {
            'slot_policy': scrapyjs.SlotPolicy.SINGLE_SLOT
        }}

        req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
        req1 = self.mw.process_request(req1, None)

        req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
        req2 = self.mw.process_request(req2, None)

        assert req1.meta.get('download_slot')
        assert req1.meta['download_slot'] == req2.meta['download_slot']

    def test_slot_policy_per_domain(self):
        # PER_DOMAIN: same-host requests share a slot; different hosts don't.
        meta = {'splash': {
            'slot_policy': scrapyjs.SlotPolicy.PER_DOMAIN
        }}

        req1 = scrapy.Request("http://example.com/path?key=value", meta=meta)
        req1 = self.mw.process_request(req1, None)

        req2 = scrapy.Request("http://example.com/path2", meta=meta)
        req2 = self.mw.process_request(req2, None)

        req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta)
        req3 = self.mw.process_request(req3, None)

        assert req1.meta.get('download_slot')
        assert req3.meta.get('download_slot')

        assert req1.meta['download_slot'] == req2.meta['download_slot']
        assert req1.meta['download_slot'] != req3.meta['download_slot']

    def test_slot_policy_scrapy_default(self):
        # SCRAPY_DEFAULT: middleware must not assign a slot at all.
        req = scrapy.Request("http://example.com", meta={'splash': {
            'slot_policy': scrapyjs.SlotPolicy.SCRAPY_DEFAULT
        }})
        req = self.mw.process_request(req, None)
        assert 'download_slot' not in req.meta

    def test_adjust_timeout(self):
        # A splash 'timeout' arg larger than download_timeout must bump it.
        req1 = scrapy.Request("http://example.com", meta={
            'splash': {'args': {'timeout': 60, 'html': 1}},

            # download_timeout is always present,
            # it is set by DownloadTimeoutMiddleware
            'download_timeout': 30,
        })
        req1 = self.mw.process_request(req1, None)
        assert req1.meta['download_timeout'] > 60

        # Without a splash timeout the existing download_timeout is kept.
        req2 = scrapy.Request("http://example.com", meta={
            'splash': {'args': {'html': 1}},
            'download_timeout': 30,
        })
        req2 = self.mw.process_request(req2, None)
        assert req2.meta['download_timeout'] == 30

    def test_spider_attribute(self):
        # A 'splash' attribute on the spider enables the middleware for
        # plain scrapy.Request objects.
        req_url = "http://scrapy.org"
        req1 = scrapy.Request(req_url)

        spider = self.crawler._create_spider("foo")
        spider.splash = {"args": {"images": 0}}

        req1 = self.mw.process_request(req1, spider)
        self.assertIn("_splash_processed", req1.meta)
        self.assertIn("render.json", req1.url)
        self.assertIn("url", json.loads(req1.body))
        self.assertEqual(json.loads(req1.body).get("url"), req_url)
        self.assertIn("images", json.loads(req1.body))

        # An empty (blank) spider attribute disables the middleware.
        spider.splash = {}
        req2 = self.mw.process_request(req1, spider)
        self.assertIsNone(req2)

0 commit comments

Comments
 (0)