Skip to content

Commit 34e7dad

Browse files
Merge pull request scrapy#1610 from darshanime/scheduler_debug
[MGR+1] Change, document `LOG_UNSERIALIZABLE_REQUESTS`
2 parents fe68a45 + d8e62e6 commit 34e7dad

File tree

5 files changed

+33
-5
lines changed

5 files changed

+33
-5
lines changed

docs/topics/jobs.rst

+4
Original file line numberDiff line numberDiff line change
@@ -96,4 +96,8 @@ But this will::
9696
somearg = response.meta['somearg']
9797
print "the argument passed is:", somearg
9898

99+
If you wish to log the requests that couldn't be serialized, you can set the
100+
:setting:`SCHEDULER_DEBUG` setting to ``True`` in the project's settings file.
101+
It is ``False`` by default.
102+
99103
.. _pickle: http://docs.python.org/library/pickle.html

docs/topics/settings.rst

+18
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,24 @@ Default: ``'scrapy.core.scheduler.Scheduler'``
10171017

10181018
The scheduler to use for crawling.
10191019

1020+
.. setting:: SCHEDULER_DEBUG
1021+
1022+
SCHEDULER_DEBUG
1023+
---------------
1024+
1025+
Default: ``False``
1026+
1027+
Setting to ``True`` will log debug information about the request scheduler.
1028+
This currently logs (only once) if requests cannot be serialized to disk.
1029+
Stats counter (``scheduler/unserializable``) tracks the number of times this happens.
1030+
1031+
Example entry in logs::
1032+
1033+
1956-01-31 00:00:00+0800 [scrapy] ERROR: Unable to serialize request:
1034+
<GET http://example.com> - reason: cannot serialize <Request at 0x9a7c7ec>
1035+
(type Request)> - no more unserializable requests will be logged
1036+
(see 'scheduler/unserializable' stats counter)
1037+
10201038
.. setting:: SPIDER_CONTRACTS
10211039

10221040
SPIDER_CONTRACTS

scrapy/core/scheduler.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def from_crawler(cls, crawler):
3030
pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
3131
dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
3232
mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
33-
logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
33+
logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
3434
return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
3535
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
3636

@@ -84,11 +84,16 @@ def _dqpush(self, request):
8484
try:
8585
reqd = request_to_dict(request, self.spider)
8686
self.dqs.push(reqd, -request.priority)
87-
except ValueError as e: # non serializable request
87+
except ValueError as e: # non serializable request
8888
if self.logunser:
89-
logger.error("Unable to serialize request: %(request)s - reason: %(reason)s",
90-
{'request': request, 'reason': e},
89+
msg = ("Unable to serialize request: %(request)s - reason:"
90+
" %(reason)s - no more unserializable requests will be"
91+
" logged (stats being collected)")
92+
logger.error(msg, {'request': request, 'reason': e},
9193
exc_info=True, extra={'spider': self.spider})
94+
self.logunser = False
95+
self.stats.inc_value('scheduler/unserializable',
96+
spider=self.spider)
9297
return
9398
else:
9499
return True

scrapy/settings/default_settings.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@
193193
LOG_LEVEL = 'DEBUG'
194194
LOG_FILE = None
195195

196-
LOG_UNSERIALIZABLE_REQUESTS = False
196+
SCHEDULER_DEBUG = False
197197

198198
LOGSTATS_INTERVAL = 60.0
199199

scrapy/settings/deprecated.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
1515
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
1616
('REDIRECT_MAX_METAREFRESH_DELAY', 'use METAREFRESH_MAXDELAY instead'),
17+
('LOG_UNSERIALIZABLE_REQUESTS', 'use SCHEDULER_DEBUG instead'),
1718
]
1819

1920

0 commit comments

Comments
 (0)