Commit 8808c1e

Merge pull request #78 from scrapy-plugins/fix-middleware
Fix process response getting duplicated requests
2 parents: c454d9e + 95bbcc4

2 files changed (+71, -20)

scrapy_crawlera/middleware.py

Lines changed: 6 additions & 2 deletions
@@ -30,6 +30,7 @@ class CrawleraMiddleware(object):
     force_enable_on_http_codes = []
     max_auth_retry_times = 10
     enabled_for_domain = {}
+    apikey = ""

     _settings = [
         ('apikey', str),
@@ -176,7 +177,7 @@ def process_response(self, request, response, spider):
             return self._handle_not_enabled_response(request, response)

         if not self._is_crawlera_response(response):
-            return request
+            return response

         key = self._get_slot_key(request)
         self._restore_original_delay(request)
@@ -232,7 +233,10 @@ def _handle_not_enabled_response(self, request, response):
         if self._should_enable_for_response(response):
             domain = self._get_url_domain(request.url)
             self.enabled_for_domain[domain] = True
-            return request
+
+            retryreq = request.copy()
+            retryreq.dont_filter = True
+            return retryreq
         return response

     def _retry_auth(self, response, request, spider):
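
Why these two changes fix the duplicated requests: Scrapy passes any Request returned from process_response() back to the scheduler. The old "return request" on the non-Crawlera path re-scheduled a fetch for every such response, while the force-enable retry could be dropped by the dupefilter because its fingerprint had already been seen. The fix returns the response untouched on the first path and, on the second, retries with a fresh copy flagged dont_filter=True. A minimal sketch of the resulting pattern; the class name and trigger condition below are illustrative, not the plugin's actual code:

    class ForceEnableRetrySketch:
        """Illustrative downloader middleware, not the plugin's real class."""

        def process_response(self, request, response, spider):
            if response.status == 403:  # hypothetical trigger condition
                # A Request returned here is re-scheduled, so it must
                # bypass the dupefilter or it is silently dropped.
                retryreq = request.copy()
                retryreq.dont_filter = True
                return retryreq
            # Everything else passes through unchanged; returning the
            # original request here would schedule a duplicate fetch.
            return response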

tests/test_crawlera.py

Lines changed: 65 additions & 18 deletions
@@ -299,6 +299,15 @@ def test_delay_adjustment(self):
         self.assertEqual(self.spider.download_delay, delay)
         self.assertNotIn('proxy.crawlera.com', dnscache)

+    def test_process_exception_outside_crawlera(self):
+        self.spider.crawlera_enabled = False
+        crawler = self._mock_crawler(self.spider, self.settings)
+        mw = self.mwcls.from_crawler(crawler)
+        mw.open_spider(self.spider)
+
+        req = Request("https://scrapy.org")
+        assert mw.process_exception(req, ConnectionDone(), self.spider) is None
+
     def test_jobid_header(self):
         # test without the environment variable 'SCRAPY_JOB'
         self.spider.crawlera_enabled = True
@@ -416,8 +425,9 @@ def test_crawlera_default_headers(self):
         self.assertEqual(req.headers['X-Crawlera-Cookies'], b'disable')
         self.assertNotIn('X-Crawlera-Profile', req.headers)

+    @patch('scrapy_crawlera.middleware.warnings')
     @patch('scrapy_crawlera.middleware.logging')
-    def test_crawlera_default_headers_conflicting_headers(self, mock_logger):
+    def test_crawlera_default_headers_conflicting_headers(self, mock_logger, mock_warnings):
         spider = self.spider
         self.spider.crawlera_enabled = True

@@ -433,6 +443,12 @@ def test_crawlera_default_headers_conflicting_headers(self, mock_logger):
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.headers['X-Crawlera-UA'], b'desktop')
         self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop')
+        mock_warnings.warn.assert_called_with(
+            "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
+            "g on some of your requests. Please check https://doc.scrapinghub.c"
+            "om/crawlera.html for more information. You can set LOG_LEVEL=DEBUG"
+            " to see the urls with problems"
+        )
         mock_logger.debug.assert_called_with(
             "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
             "g on request http://www.scrapytest.org/other. X-Crawlera-UA will b"
@@ -447,6 +463,12 @@ def test_crawlera_default_headers_conflicting_headers(self, mock_logger):
         assert mw.process_request(req, spider) is None
         self.assertEqual(req.headers['X-Crawlera-UA'], b'desktop')
         self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop')
+        mock_warnings.warn.assert_called_with(
+            "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
+            "g on some of your requests. Please check https://doc.scrapinghub.c"
+            "om/crawlera.html for more information. You can set LOG_LEVEL=DEBUG"
+            " to see the urls with problems"
+        )
         mock_logger.debug.assert_called_with(
             "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
             "g on request http://www.scrapytest.org/other. X-Crawlera-UA will b"
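
The new assertions capture a two-level reporting split: warnings.warn() fires with a generic, URL-free message whenever conflicting headers are detected, while the per-request detail stays in logging.debug() and only surfaces at LOG_LEVEL=DEBUG, as the warning text itself advises. A hedged sketch of that split, with a hypothetical helper name and the messages abbreviated:

    import logging
    import warnings

    def report_header_conflict(url):
        # Coarse notice, no URL; visible at default verbosity.
        warnings.warn(
            "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are "
            "conflicting on some of your requests. ..."
        )
        # Fine-grained detail, only shown when LOG_LEVEL=DEBUG.
        logging.debug(
            "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are "
            "conflicting on request %s. ...", url
        )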
@@ -650,42 +672,67 @@ def test_process_response_enables_crawlera(self):
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(self.spider)

-        req = Request(url)
-        res = Response(url, status=403, request=req)
-        out = mw.process_response(req, res, self.spider)
-        self.assertIsInstance(out, Request)
-        self.assertEqual(mw.enabled_for_domain["scrapy.org"], True)
-        self.assertEqual(mw.enabled, False)
-
-        # A good response shouldnt enable it
-        mw.enabled_for_domain = {}
+        # A good code response should not enable it
         req = Request(url)
         res = Response(url, status=200, request=req)
+        mw.process_request(req, self.spider)
         out = mw.process_response(req, res, self.spider)
         self.assertIsInstance(out, Response)
         self.assertEqual(mw.enabled_for_domain, {})
         self.assertEqual(mw.enabled, False)
+        self.assertEqual(mw.crawler.stats.get_stats(), {})

-        req = Request(url)
+        # A bad code response should enable it
         res = Response(url, status=403, request=req)
+        mw.process_request(req, self.spider)
         out = mw.process_response(req, res, self.spider)
         self.assertIsInstance(out, Request)
         self.assertEqual(mw.enabled, False)
         self.assertEqual(mw.enabled_for_domain["scrapy.org"], True)
-        # Another regular response with bad code should be retried
+        self.assertEqual(mw.crawler.stats.get_stats(), {})
+
+        # Another regular response with bad code should be done on crawlera
+        # and not be retried
+        res = Response(url, status=403, request=req)
+        mw.process_request(req, self.spider)
         out = mw.process_response(req, res, self.spider)
-        self.assertIsInstance(out, Request)
+        self.assertIsInstance(out, Response)
         self.assertEqual(mw.enabled, False)
         self.assertEqual(mw.enabled_for_domain["scrapy.org"], True)
-        # A crawlera response with bad code should not
+        self.assertEqual(mw.crawler.stats.get_value("crawlera/request"), 1)
+
+        # A crawlera response with bad code should not be retried as well
+        mw.process_request(req, self.spider)
         res = self._mock_crawlera_response(url, status=403, request=req)
         out = mw.process_response(req, res, self.spider)
         self.assertIsInstance(out, Response)
         self.assertEqual(mw.enabled, False)
         self.assertEqual(mw.enabled_for_domain["scrapy.org"], True)
+        self.assertEqual(mw.crawler.stats.get_value("crawlera/request"), 2)
+
+    def test_process_response_from_file_scheme(self):
+        url = "file:///tmp/foobar.txt"
+
+        self.spider.crawlera_enabled = False
+        self.settings['CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES'] = [403]
+        crawler = self._mock_crawler(self.spider, self.settings)
+        mw = self.mwcls.from_crawler(crawler)
+        mw.enabled_for_domain = {}
+        mw.open_spider(self.spider)
+
+        # A good code response should not enable it
+        req = Request(url)
+        res = Response(url, status=200, request=req)
+        mw.process_request(req, self.spider)
+        out = mw.process_response(req, res, self.spider)
+        self.assertIsInstance(out, Response)
+        self.assertEqual(mw.enabled_for_domain, {})
+        self.assertEqual(mw.enabled, False)
+        self.assertEqual(mw.crawler.stats.get_stats(), {})
+        self.assertEqual(out.status, 200)

     @patch('scrapy_crawlera.middleware.logging')
-    def test_no_apikey_warning_crawlera_disabled(self, mock_logger):
+    def test_apikey_warning_crawlera_disabled(self, mock_logger):
         self.spider.crawlera_enabled = False
         settings = {}
         crawler = self._mock_crawler(self.spider, settings)
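
The stats assertions above imply the middleware counts each request it actually routes through the proxy: the crawlera/request counter stays untouched while the middleware is disabled, reads 1 after the first Crawlera-bound pass, and 2 after the second. In a Scrapy middleware such a counter lives on the crawler's stats collector; a minimal sketch (the counter key is taken from the tests, the surrounding class is assumed):

    class StatsCountingSketch:
        """Illustrative fragment; crawler comes from from_crawler()."""

        def __init__(self, crawler):
            self.crawler = crawler

        def process_request(self, request, spider):
            # inc_value() is Scrapy's standard StatsCollector API;
            # "crawlera/request" is the key the tests assert on.
            self.crawler.stats.inc_value('crawlera/request')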
@@ -695,7 +742,7 @@ def test_no_apikey_warning_crawlera_disabled(self, mock_logger):
         mock_logger.warning.assert_not_called()

     @patch('scrapy_crawlera.middleware.logging')
-    def test_apikey_warning_crawlera_enabled(self, mock_logger):
+    def test_no_apikey_warning_crawlera_enabled(self, mock_logger):
         self.spider.crawlera_enabled = True
         settings = {}
         crawler = self._mock_crawler(self.spider, settings)
@@ -708,7 +755,7 @@ def test_apikey_warning_crawlera_enabled(self, mock_logger):
         )

     @patch('scrapy_crawlera.middleware.logging')
-    def test_apikey_warning_force_enable(self, mock_logger):
+    def test_no_apikey_warning_force_enable(self, mock_logger):
         self.spider.crawlera_enabled = False
         settings = {'CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES': [403]}
         crawler = self._mock_crawler(self.spider, settings)
@@ -721,7 +768,7 @@ def test_apikey_warning_force_enable(self, mock_logger):
         )

     @patch('scrapy_crawlera.middleware.logging')
-    def test_no_apikey_warning_force_enable(self, mock_logger):
+    def test_apikey_warning_force_enable(self, mock_logger):
         self.spider.crawlera_enabled = False
         settings = {
             'CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES': [403],
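
For context, a sketch of the configuration these tests exercise, as it might look in a project's settings.py. The CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES name comes from the diff above; the other setting names and the middleware order value follow the plugin's documented defaults and may need adjusting:

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_crawlera.CrawleraMiddleware': 610,
    }

    CRAWLERA_ENABLED = False          # stay off Crawlera by default...
    CRAWLERA_APIKEY = '<your apikey>'
    # ...but force it on, per domain, once a response has one of these codes:
    CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES = [403]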
