Skip to content

Commit 34e7dad

Browse files
Merge pull request scrapy#1610 from darshanime/scheduler_debug
[MGR+1] Change, document `LOG_UNSERIALIZABLE_REQUESTS`
2 parents fe68a45 + d8e62e6 commit 34e7dad

File tree

5 files changed

+33
-5
lines changed

5 files changed

+33
-5
lines changed

docs/topics/jobs.rst

+4
Original file line numberDiff line numberDiff line change
@@ -96,4 +96,8 @@ But this will::
9696
somearg = response.meta['somearg']
9797
print "the argument passed is:", somearg
9898

99+
If you wish to log the requests that couldn't be serialized, you can set the
100+
:setting:`SCHEDULER_DEBUG` setting to ``True`` in the project's settings file.
101+
It is ``False`` by default.
102+
99103
.. _pickle: http://docs.python.org/library/pickle.html

docs/topics/settings.rst

+18
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,24 @@ Default: ``'scrapy.core.scheduler.Scheduler'``
10171017

10181018
The scheduler to use for crawling.
10191019

1020+
.. setting:: SCHEDULER_DEBUG
1021+
1022+
SCHEDULER_DEBUG
1023+
---------------
1024+
1025+
Default: ``False``
1026+
1027+
Setting to ``True`` will log debug information about the request scheduler.
1028+
This currently logs (only once) if requests cannot be serialized to disk.
1029+
Stats counter (``scheduler/unserializable``) tracks the number of times this happens.
1030+
1031+
Example entry in logs::
1032+
1033+
1956-01-31 00:00:00+0800 [scrapy] ERROR: Unable to serialize request:
1034+
<GET http://example.com> - reason: cannot serialize <Request at 0x9a7c7ec>
1035+
(type Request)> - no more unserializable requests will be logged
1036+
(see 'scheduler/unserializable' stats counter)
1037+
10201038
.. setting:: SPIDER_CONTRACTS
10211039

10221040
SPIDER_CONTRACTS

scrapy/core/scheduler.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def from_crawler(cls, crawler):
3030
pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
3131
dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
3232
mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
33-
logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
33+
logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
3434
return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
3535
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
3636

@@ -84,11 +84,16 @@ def _dqpush(self, request):
8484
try:
8585
reqd = request_to_dict(request, self.spider)
8686
self.dqs.push(reqd, -request.priority)
87-
except ValueError as e: # non serializable request
87+
except ValueError as e: # non serializable request
8888
if self.logunser:
89-
logger.error("Unable to serialize request: %(request)s - reason: %(reason)s",
90-
{'request': request, 'reason': e},
89+
msg = ("Unable to serialize request: %(request)s - reason:"
90+
" %(reason)s - no more unserializable requests will be"
91+
" logged (stats being collected)")
92+
logger.error(msg, {'request': request, 'reason': e},
9193
exc_info=True, extra={'spider': self.spider})
94+
self.logunser = False
95+
self.stats.inc_value('scheduler/unserializable',
96+
spider=self.spider)
9297
return
9398
else:
9499
return True

scrapy/settings/default_settings.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@
193193
LOG_LEVEL = 'DEBUG'
194194
LOG_FILE = None
195195

196-
LOG_UNSERIALIZABLE_REQUESTS = False
196+
SCHEDULER_DEBUG = False
197197

198198
LOGSTATS_INTERVAL = 60.0
199199

scrapy/settings/deprecated.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
1515
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
1616
('REDIRECT_MAX_METAREFRESH_DELAY', 'use METAREFRESH_MAXDELAY instead'),
17+
('LOG_UNSERIALIZABLE_REQUESTS', 'use SCHEDULER_DEBUG instead'),
1718
]
1819

1920

0 commit comments

Comments
 (0)