@@ -299,6 +299,15 @@ def test_delay_adjustment(self):
299299 self .assertEqual (self .spider .download_delay , delay )
300300 self .assertNotIn ('proxy.crawlera.com' , dnscache )
301301
302+ def test_process_exception_outside_crawlera (self ):
303+ self .spider .crawlera_enabled = False
304+ crawler = self ._mock_crawler (self .spider , self .settings )
305+ mw = self .mwcls .from_crawler (crawler )
306+ mw .open_spider (self .spider )
307+
308+ req = Request ("https://scrapy.org" )
309+ assert mw .process_exception (req , ConnectionDone (), self .spider ) is None
310+
302311 def test_jobid_header (self ):
303312 # test without the environment variable 'SCRAPY_JOB'
304313 self .spider .crawlera_enabled = True
@@ -416,8 +425,9 @@ def test_crawlera_default_headers(self):
416425 self .assertEqual (req .headers ['X-Crawlera-Cookies' ], b'disable' )
417426 self .assertNotIn ('X-Crawlera-Profile' , req .headers )
418427
428+ @patch ('scrapy_crawlera.middleware.warnings' )
419429 @patch ('scrapy_crawlera.middleware.logging' )
420- def test_crawlera_default_headers_conflicting_headers (self , mock_logger ):
430+ def test_crawlera_default_headers_conflicting_headers (self , mock_logger , mock_warnings ):
421431 spider = self .spider
422432 self .spider .crawlera_enabled = True
423433
@@ -433,6 +443,12 @@ def test_crawlera_default_headers_conflicting_headers(self, mock_logger):
433443 assert mw .process_request (req , spider ) is None
434444 self .assertEqual (req .headers ['X-Crawlera-UA' ], b'desktop' )
435445 self .assertEqual (req .headers ['X-Crawlera-Profile' ], b'desktop' )
446+ mock_warnings .warn .assert_called_with (
447+ "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
448+ "g on some of your requests. Please check https://doc.scrapinghub.c"
449+ "om/crawlera.html for more information. You can set LOG_LEVEL=DEBUG"
450+ " to see the urls with problems"
451+ )
436452 mock_logger .debug .assert_called_with (
437453 "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
438454 "g on request http://www.scrapytest.org/other. X-Crawlera-UA will b"
@@ -447,6 +463,12 @@ def test_crawlera_default_headers_conflicting_headers(self, mock_logger):
447463 assert mw .process_request (req , spider ) is None
448464 self .assertEqual (req .headers ['X-Crawlera-UA' ], b'desktop' )
449465 self .assertEqual (req .headers ['X-Crawlera-Profile' ], b'desktop' )
466+ mock_warnings .warn .assert_called_with (
467+ "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
468+ "g on some of your requests. Please check https://doc.scrapinghub.c"
469+ "om/crawlera.html for more information. You can set LOG_LEVEL=DEBUG"
470+ " to see the urls with problems"
471+ )
450472 mock_logger .debug .assert_called_with (
451473 "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are conflictin"
452474 "g on request http://www.scrapytest.org/other. X-Crawlera-UA will b"
@@ -650,42 +672,67 @@ def test_process_response_enables_crawlera(self):
650672 mw = self .mwcls .from_crawler (crawler )
651673 mw .open_spider (self .spider )
652674
653- req = Request (url )
654- res = Response (url , status = 403 , request = req )
655- out = mw .process_response (req , res , self .spider )
656- self .assertIsInstance (out , Request )
657- self .assertEqual (mw .enabled_for_domain ["scrapy.org" ], True )
658- self .assertEqual (mw .enabled , False )
659-
660- # A good response shouldnt enable it
661- mw .enabled_for_domain = {}
675+ # A good code response should not enable it
662676 req = Request (url )
663677 res = Response (url , status = 200 , request = req )
678+ mw .process_request (req , self .spider )
664679 out = mw .process_response (req , res , self .spider )
665680 self .assertIsInstance (out , Response )
666681 self .assertEqual (mw .enabled_for_domain , {})
667682 self .assertEqual (mw .enabled , False )
683+ self .assertEqual (mw .crawler .stats .get_stats (), {})
668684
669- req = Request ( url )
685+ # A bad code response should enable it
670686 res = Response (url , status = 403 , request = req )
687+ mw .process_request (req , self .spider )
671688 out = mw .process_response (req , res , self .spider )
672689 self .assertIsInstance (out , Request )
673690 self .assertEqual (mw .enabled , False )
674691 self .assertEqual (mw .enabled_for_domain ["scrapy.org" ], True )
675- # Another regular response with bad code should be retried
692+ self .assertEqual (mw .crawler .stats .get_stats (), {})
693+
694+ # Another regular response with bad code should be done on crawlera
695+ # and not be retried
696+ res = Response (url , status = 403 , request = req )
697+ mw .process_request (req , self .spider )
676698 out = mw .process_response (req , res , self .spider )
677- self .assertIsInstance (out , Request )
699+ self .assertIsInstance (out , Response )
678700 self .assertEqual (mw .enabled , False )
679701 self .assertEqual (mw .enabled_for_domain ["scrapy.org" ], True )
680- # A crawlera response with bad code should not
702+ self .assertEqual (mw .crawler .stats .get_value ("crawlera/request" ), 1 )
703+
704+ # A crawlera response with bad code should not be retried as well
705+ mw .process_request (req , self .spider )
681706 res = self ._mock_crawlera_response (url , status = 403 , request = req )
682707 out = mw .process_response (req , res , self .spider )
683708 self .assertIsInstance (out , Response )
684709 self .assertEqual (mw .enabled , False )
685710 self .assertEqual (mw .enabled_for_domain ["scrapy.org" ], True )
711+ self .assertEqual (mw .crawler .stats .get_value ("crawlera/request" ), 2 )
712+
713+ def test_process_response_from_file_scheme (self ):
714+ url = "file:///tmp/foobar.txt"
715+
716+ self .spider .crawlera_enabled = False
717+ self .settings ['CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES' ] = [403 ]
718+ crawler = self ._mock_crawler (self .spider , self .settings )
719+ mw = self .mwcls .from_crawler (crawler )
720+ mw .enabled_for_domain = {}
721+ mw .open_spider (self .spider )
722+
723+ # A good code response should not enable it
724+ req = Request (url )
725+ res = Response (url , status = 200 , request = req )
726+ mw .process_request (req , self .spider )
727+ out = mw .process_response (req , res , self .spider )
728+ self .assertIsInstance (out , Response )
729+ self .assertEqual (mw .enabled_for_domain , {})
730+ self .assertEqual (mw .enabled , False )
731+ self .assertEqual (mw .crawler .stats .get_stats (), {})
732+ self .assertEqual (out .status , 200 )
686733
687734 @patch ('scrapy_crawlera.middleware.logging' )
688- def test_no_apikey_warning_crawlera_disabled (self , mock_logger ):
735+ def test_apikey_warning_crawlera_disabled (self , mock_logger ):
689736 self .spider .crawlera_enabled = False
690737 settings = {}
691738 crawler = self ._mock_crawler (self .spider , settings )
@@ -695,7 +742,7 @@ def test_no_apikey_warning_crawlera_disabled(self, mock_logger):
695742 mock_logger .warning .assert_not_called ()
696743
697744 @patch ('scrapy_crawlera.middleware.logging' )
698- def test_apikey_warning_crawlera_enabled (self , mock_logger ):
745+ def test_no_apikey_warning_crawlera_enabled (self , mock_logger ):
699746 self .spider .crawlera_enabled = True
700747 settings = {}
701748 crawler = self ._mock_crawler (self .spider , settings )
@@ -708,7 +755,7 @@ def test_apikey_warning_crawlera_enabled(self, mock_logger):
708755 )
709756
710757 @patch ('scrapy_crawlera.middleware.logging' )
711- def test_apikey_warning_force_enable (self , mock_logger ):
758+ def test_no_apikey_warning_force_enable (self , mock_logger ):
712759 self .spider .crawlera_enabled = False
713760 settings = {'CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES' : [403 ]}
714761 crawler = self ._mock_crawler (self .spider , settings )
@@ -721,7 +768,7 @@ def test_apikey_warning_force_enable(self, mock_logger):
721768 )
722769
723770 @patch ('scrapy_crawlera.middleware.logging' )
724- def test_no_apikey_warning_force_enable (self , mock_logger ):
771+ def test_apikey_warning_force_enable (self , mock_logger ):
725772 self .spider .crawlera_enabled = False
726773 settings = {
727774 'CRAWLERA_FORCE_ENABLE_ON_HTTP_CODES' : [403 ],
0 commit comments