Skip to content

Commit 2ebce38

Browse files
authored
Merge pull request #79 from starrify/more-stats-to-include
Added: More stats entries to include
2 parents cd5fb5f + 1a51c6c commit 2ebce38

File tree

2 files changed: +18 −5 lines changed

scrapy_crawlera/middleware.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,13 @@ def process_response(self, request, response, spider):
191191
self._restore_original_delay(request)
192192

193193
if self._is_no_available_proxies(response) or self._is_auth_error(response):
194-
self._set_custom_delay(request, next(self.exp_backoff))
194+
if self._is_no_available_proxies(response):
195+
reason = 'noslaves'
196+
else:
197+
reason = 'autherror'
198+
self._set_custom_delay(request, next(self.exp_backoff), reason=reason)
195199
else:
200+
self.crawler.stats.inc_value('crawlera/delay/reset_backoff')
196201
self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
197202

198203
if self._is_auth_error(response):
@@ -202,6 +207,7 @@ def process_response(self, request, response, spider):
202207
if retries < self.max_auth_retry_times:
203208
return self._retry_auth(response, request, spider)
204209
else:
210+
self.crawler.stats.inc_value('crawlera/retries/auth/max_reached')
205211
logging.warning(
206212
"Max retries for authentication issues reached, please check auth"
207213
" information settings",
@@ -215,7 +221,7 @@ def process_response(self, request, response, spider):
215221
else:
216222
after = response.headers.get('retry-after')
217223
if after:
218-
self._set_custom_delay(request, float(after))
224+
self._set_custom_delay(request, float(after), reason='banned')
219225
self.crawler.stats.inc_value('crawlera/response/banned')
220226
else:
221227
self._bans[key] = 0
@@ -235,7 +241,7 @@ def process_exception(self, request, exception, spider):
235241
if isinstance(exception, (ConnectionRefusedError, ConnectionDone)):
236242
# Handle crawlera downtime
237243
self._clear_dns_cache()
238-
self._set_custom_delay(request, self.connection_refused_delay)
244+
self._set_custom_delay(request, self.connection_refused_delay, reason='conn_refused')
239245

240246
def _handle_not_enabled_response(self, request, response):
241247
if self._should_enable_for_response(response):
@@ -244,6 +250,7 @@ def _handle_not_enabled_response(self, request, response):
244250

245251
retryreq = request.copy()
246252
retryreq.dont_filter = True
253+
self.crawler.stats.inc_value('crawlera/retries/should_have_been_enabled')
247254
return retryreq
248255
return response
249256

@@ -256,6 +263,7 @@ def _retry_auth(self, response, request, spider):
256263
retryreq = request.copy()
257264
retryreq.meta['crawlera_auth_retry_times'] = retries
258265
retryreq.dont_filter = True
266+
self.crawler.stats.inc_value('crawlera/retries/auth')
259267
return retryreq
260268

261269
def _clear_dns_cache(self):
@@ -286,14 +294,17 @@ def _get_slot(self, request):
286294
key = self._get_slot_key(request)
287295
return key, self.crawler.engine.downloader.slots.get(key)
288296

289-
def _set_custom_delay(self, request, delay):
297+
def _set_custom_delay(self, request, delay, reason=None):
290298
"""Set custom delay for slot and save original one."""
291299
key, slot = self._get_slot(request)
292300
if not slot:
293301
return
294302
if self._saved_delays[key] is None:
295303
self._saved_delays[key] = slot.delay
296304
slot.delay = delay
305+
if reason is not None:
306+
self.crawler.stats.inc_value('crawlera/delay/%s' % reason)
307+
self.crawler.stats.inc_value('crawlera/delay/%s/total' % reason, delay)
297308

298309
def _restore_original_delay(self, request):
299310
"""Restore original delay for slot if it was changed."""

tests/test_crawlera.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -702,7 +702,9 @@ def test_process_response_enables_crawlera(self):
702702
self.assertIsInstance(out, Request)
703703
self.assertEqual(mw.enabled, False)
704704
self.assertEqual(mw.enabled_for_domain["scrapy.org"], True)
705-
self.assertEqual(mw.crawler.stats.get_stats(), {})
705+
self.assertEqual(mw.crawler.stats.get_stats(), {
706+
'crawlera/retries/should_have_been_enabled': 1,
707+
})
706708

707709
# Another regular response with bad code should be done on crawlera
708710
# and not be retried

Comments (0)