Skip to content

Commit a2b52ed

Browse files
authored
Use request.meta than response.meta in the middleware (#92)
* Updating the tests so that unpinned responses would be used * Access request.meta instead in the middleware since responses here shall not be pinned yet * Properly performing the Response.__init__ check override -- only in the test case setup
1 parent dc777a5 commit a2b52ed

File tree

2 files changed

+25
-26
lines changed

2 files changed

+25
-26
lines changed

scrapy_crawlera/middleware.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ def process_response(self, request, response, spider):
201201
if self._is_auth_error(response):
202202
# When crawlera has issues it might not be able to authenticate users
203203
# we must retry
204-
retries = response.meta.get('crawlera_auth_retry_times', 0)
204+
retries = request.meta.get('crawlera_auth_retry_times', 0)
205205
if retries < self.max_auth_retry_times:
206206
return self._retry_auth(response, request, spider)
207207
else:
@@ -257,7 +257,7 @@ def _retry_auth(self, response, request, spider):
257257
"Retrying crawlera request for authentication issue",
258258
extra={'spider': self.spider},
259259
)
260-
retries = response.meta.get('crawlera_auth_retry_times', 0) + 1
260+
retries = request.meta.get('crawlera_auth_retry_times', 0) + 1
261261
retryreq = request.copy()
262262
retryreq.meta['crawlera_auth_retry_times'] = retries
263263
retryreq.dont_filter = True

tests/test_crawlera.py

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ class CrawleraMiddlewareTestCase(TestCase):
3535
def setUp(self):
3636
self.spider = Spider('foo')
3737
self.settings = {'CRAWLERA_APIKEY': 'apikey'}
38+
Response_init_orig = Response.__init__
39+
40+
def Response_init_new(self, *args, **kwargs):
41+
assert not kwargs.get('request'), 'response objects at this stage shall not be pinned'
42+
return Response_init_orig(self, *args, **kwargs)
43+
44+
Response.__init__ = Response_init_new
3845

3946
def _mock_crawlera_response(self, url, headers=None, **kwargs):
4047
crawlera_version = choice(("1.36.3-cd5e44", "", None))
@@ -70,9 +77,9 @@ def _assert_disabled(self, spider, settings=None):
7077
self.assertEqual(req.meta.get('proxy'), None)
7178
self.assertEqual(req.meta.get('download_timeout'), None)
7279
self.assertEqual(req.headers.get('Proxy-Authorization'), None)
73-
res = Response(req.url, request=req)
80+
res = Response(req.url)
7481
assert mw.process_response(req, res, spider) is res
75-
res = Response(req.url, status=mw.ban_code, request=req)
82+
res = Response(req.url, status=mw.ban_code)
7683
assert mw.process_response(req, res, spider) is res
7784

7885
def _assert_enabled(self, spider,
@@ -89,7 +96,7 @@ def _assert_enabled(self, spider,
8996
self.assertEqual(req.meta.get('proxy'), proxyurl)
9097
self.assertEqual(req.meta.get('download_timeout'), download_timeout)
9198
self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
92-
res = self._mock_crawlera_response(req.url, request=req)
99+
res = self._mock_crawlera_response(req.url)
93100
assert mw.process_response(req, res, spider) is res
94101

95102
# disabled if 'dont_proxy=True' is set
@@ -99,7 +106,7 @@ def _assert_enabled(self, spider,
99106
self.assertEqual(req.meta.get('proxy'), None)
100107
self.assertEqual(req.meta.get('download_timeout'), None)
101108
self.assertEqual(req.headers.get('Proxy-Authorization'), None)
102-
res = self._mock_crawlera_response(req.url, request=req)
109+
res = self._mock_crawlera_response(req.url)
103110
assert mw.process_response(req, res, spider) is res
104111
del req.meta['dont_proxy']
105112

@@ -270,7 +277,6 @@ def test_delay_adjustment(self):
270277
ban_url,
271278
status=self.bancode,
272279
headers=headers,
273-
request=req
274280
)
275281
mw.process_response(req, res, self.spider)
276282
self.assertEqual(slot.delay, delay)
@@ -286,7 +292,6 @@ def test_delay_adjustment(self):
286292
ban_url,
287293
status=self.bancode,
288294
headers=headers,
289-
request=req
290295
)
291296
mw.process_response(req, res, self.spider)
292297
self.assertEqual(slot.delay, retry_after)
@@ -295,7 +300,7 @@ def test_delay_adjustment(self):
295300
# DNS cache should be cleared in case of errors
296301
dnscache['proxy.crawlera.com'] = '1.1.1.1'
297302

298-
res = self._mock_crawlera_response(url, request=req)
303+
res = self._mock_crawlera_response(url)
299304
mw.process_response(req, res, self.spider)
300305
self.assertEqual(slot.delay, delay)
301306
self.assertEqual(self.spider.download_delay, delay)
@@ -308,7 +313,7 @@ def test_delay_adjustment(self):
308313
self.assertNotIn('proxy.crawlera.com', dnscache)
309314

310315
dnscache['proxy.crawlera.com'] = '1.1.1.1'
311-
res = self._mock_crawlera_response(ban_url, request=req)
316+
res = self._mock_crawlera_response(ban_url)
312317
mw.process_response(req, res, self.spider)
313318
self.assertEqual(slot.delay, delay)
314319
self.assertEqual(self.spider.download_delay, delay)
@@ -320,7 +325,7 @@ def test_delay_adjustment(self):
320325
self.assertNotIn('proxy.crawlera.com', dnscache)
321326

322327
dnscache['proxy.crawlera.com'] = '1.1.1.1'
323-
res = self._mock_crawlera_response(ban_url, status=self.bancode, request=req)
328+
res = self._mock_crawlera_response(ban_url, status=self.bancode)
324329
mw.process_response(req, res, self.spider)
325330
self.assertEqual(slot.delay, delay)
326331
self.assertEqual(self.spider.download_delay, delay)
@@ -562,7 +567,6 @@ def test_noslaves_delays(self, random_uniform_patch):
562567
ban_url,
563568
status=self.bancode,
564569
headers=headers,
565-
request=noslaves_req
566570
)
567571

568572
# delays grow exponentially
@@ -585,7 +589,6 @@ def test_noslaves_delays(self, random_uniform_patch):
585589
ban_url,
586590
status=self.bancode,
587591
headers=ban_headers,
588-
request=ban_req
589592
)
590593
mw.process_response(ban_req, ban_res, self.spider)
591594
self.assertEqual(slot.delay, default_delay)
@@ -597,7 +600,6 @@ def test_noslaves_delays(self, random_uniform_patch):
597600
good_res = self._mock_crawlera_response(
598601
url,
599602
status=200,
600-
request=good_req
601603
)
602604
mw.process_response(good_req, good_res, self.spider)
603605
self.assertEqual(slot.delay, default_delay)
@@ -631,7 +633,6 @@ def test_auth_error_retries(self, random_uniform_patch):
631633
auth_error_response = self._mock_crawlera_response(
632634
ban_url,
633635
status=self.auth_error_code,
634-
request=auth_error_req,
635636
headers=auth_error_headers
636637
)
637638

@@ -641,34 +642,33 @@ def test_auth_error_retries(self, random_uniform_patch):
641642
retry_times = req.meta["crawlera_auth_retry_times"]
642643
self.assertEqual(retry_times, 1)
643644

644-
auth_error_response.meta["crawlera_auth_retry_times"] = retry_times
645+
auth_error_req.meta["crawlera_auth_retry_times"] = retry_times
645646
req = mw.process_response(auth_error_req, auth_error_response, self.spider)
646647
self.assertEqual(slot.delay, backoff_step * 2 ** 1)
647648
retry_times = req.meta["crawlera_auth_retry_times"]
648649
self.assertEqual(retry_times, 2)
649650

650-
auth_error_response.meta["crawlera_auth_retry_times"] = retry_times
651+
auth_error_req.meta["crawlera_auth_retry_times"] = retry_times
651652
req = mw.process_response(auth_error_req, auth_error_response, self.spider)
652653
self.assertEqual(slot.delay, backoff_step * 2 ** 2)
653654
retry_times = req.meta["crawlera_auth_retry_times"]
654655
self.assertEqual(retry_times, 3)
655656

656-
auth_error_response.meta["crawlera_auth_retry_times"] = retry_times
657+
auth_error_req.meta["crawlera_auth_retry_times"] = retry_times
657658
req = mw.process_response(auth_error_req, auth_error_response, self.spider)
658659
self.assertEqual(slot.delay, max_delay)
659660
retry_times = req.meta["crawlera_auth_retry_times"]
660661
self.assertEqual(retry_times, 4)
661662

662663
# Should return a response when after max number of retries
663-
auth_error_response.meta["crawlera_auth_retry_times"] = retry_times
664+
auth_error_req.meta["crawlera_auth_retry_times"] = retry_times
664665
res = mw.process_response(auth_error_req, auth_error_response, self.spider)
665666
self.assertIsInstance(res, Response)
666667

667668
# non crawlera 407 is not retried
668669
non_crawlera_407_response = self._mock_crawlera_response(
669670
ban_url,
670671
status=self.auth_error_code,
671-
request=auth_error_req,
672672
)
673673
res = mw.process_response(auth_error_req, non_crawlera_407_response, self.spider)
674674
self.assertIsInstance(res, Response)
@@ -706,7 +706,7 @@ def test_process_response_enables_crawlera(self):
706706

707707
# A good code response should not enable it
708708
req = Request(url)
709-
res = Response(url, status=200, request=req)
709+
res = Response(url, status=200)
710710
mw.process_request(req, self.spider)
711711
out = mw.process_response(req, res, self.spider)
712712
self.assertIsInstance(out, Response)
@@ -715,7 +715,7 @@ def test_process_response_enables_crawlera(self):
715715
self.assertEqual(mw.crawler.stats.get_stats(), {})
716716

717717
# A bad code response should enable it
718-
res = Response(url, status=403, request=req)
718+
res = Response(url, status=403)
719719
mw.process_request(req, self.spider)
720720
out = mw.process_response(req, res, self.spider)
721721
self.assertIsInstance(out, Request)
@@ -727,7 +727,7 @@ def test_process_response_enables_crawlera(self):
727727

728728
# Another regular response with bad code should be done on crawlera
729729
# and not be retried
730-
res = Response(url, status=403, request=req)
730+
res = Response(url, status=403)
731731
mw.process_request(req, self.spider)
732732
out = mw.process_response(req, res, self.spider)
733733
self.assertIsInstance(out, Response)
@@ -737,7 +737,7 @@ def test_process_response_enables_crawlera(self):
737737

738738
# A crawlera response with bad code should not be retried as well
739739
mw.process_request(req, self.spider)
740-
res = self._mock_crawlera_response(url, status=403, request=req)
740+
res = self._mock_crawlera_response(url, status=403)
741741
out = mw.process_response(req, res, self.spider)
742742
self.assertIsInstance(out, Response)
743743
self.assertEqual(mw.enabled, False)
@@ -756,7 +756,7 @@ def test_process_response_from_file_scheme(self):
756756

757757
# A good code response should not enable it
758758
req = Request(url)
759-
res = Response(url, status=200, request=req)
759+
res = Response(url, status=200)
760760
mw.process_request(req, self.spider)
761761
out = mw.process_response(req, res, self.spider)
762762
self.assertIsInstance(out, Response)
@@ -872,7 +872,6 @@ def test_no_slot(self):
872872
ban_url,
873873
status=self.bancode,
874874
headers=headers,
875-
request=noslaves_req
876875
)
877876
# checking that response was processed
878877
response = mw.process_response(noslaves_req, noslaves_res, self.spider)

0 commit comments

Comments
 (0)