Skip to content

Commit

Permalink
Fix (and test and document) logging on py2 with non-ascii bytestrings.
Browse files Browse the repository at this point in the history
The immediate motivation was an error in request logging when the
request contains non-ascii characters, but a more general fix is
desirable since there are probably more cases where we log utf8 byte
strings, and in general a logging system that blows up on data that
has a perfectly useful representation is not ideal.

Closes tornadoweb#544.
  • Loading branch information
bdarnell committed Jun 27, 2012
1 parent c469936 commit 83b3882
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 2 deletions.
25 changes: 24 additions & 1 deletion tornado/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,14 +421,37 @@ def format(self, record):
record.message = record.getMessage()
except Exception, e:
record.message = "Bad message (%r): %r" % (e, record.__dict__)
assert isinstance(record.message, basestring) # guaranteed by logging
record.asctime = time.strftime(
"%y%m%d %H:%M:%S", self.converter(record.created))
prefix = '[%(levelname)1.1s %(asctime)s %(module)s:%(lineno)d]' % \
record.__dict__
if self._color:
prefix = (self._colors.get(record.levelno, self._normal) +
prefix + self._normal)
formatted = prefix + " " + record.message

# Encoding notes: The logging module prefers to work with character
# strings, but only enforces that log messages are instances of
# basestring. In python 2, non-ascii bytestrings will make
# their way through the logging framework until they blow up with
# an unhelpful decoding error (with this formatter it happens
# when we attach the prefix, but there are other opportunities for
# exceptions further along in the framework).
#
# If a byte string makes it this far, convert it to unicode to
# ensure it will make it out to the logs. Use repr() as a fallback
# to ensure that all byte strings can be converted successfully,
# but don't do it by default so we don't add extra quotes to ascii
# bytestrings. This is a bit of a hacky place to do this, but
# it's worth it since the encoding errors that would otherwise
# result are so useless (and tornado is fond of using utf8-encoded
# byte strings whereever possible).
try:
message = _unicode(record.message)
except UnicodeDecodeError:
message = repr(record.message)

formatted = prefix + " " + message
if record.exc_info:
if not record.exc_text:
record.exc_text = self.formatException(record.exc_info)
Expand Down
85 changes: 84 additions & 1 deletion tornado/test/options_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from __future__ import absolute_import, division, with_statement
import logging
import os
import re
import tempfile
import unittest

from tornado.options import _Options
from tornado.escape import utf8
from tornado.options import _Options, _LogFormatter
from tornado.util import b, bytes_type


class OptionsTest(unittest.TestCase):
Expand All @@ -17,3 +23,80 @@ def setUp(self):
def test_parse_command_line(self):
self.options.parse_command_line(["main.py", "--port=443"])
self.assertEqual(self.options.port, 443)


class LogFormatterTest(unittest.TestCase):
LINE_RE = re.compile(b("\x01\\[E [0-9]{6} [0-9]{2}:[0-9]{2}:[0-9]{2} options_test:[0-9]+\\]\x02 (.*)"))

def setUp(self):
self.formatter = _LogFormatter(color=False)
# Fake color support. We can't guarantee anything about the $TERM
# variable when the tests are run, so just patch in some values
# for testing. (testing with color off fails to expose some potential
# encoding issues from the control characters)
self.formatter._colors = {
logging.ERROR: u"\u0001",
}
self.formatter._normal = u"\u0002"
self.formatter._color = True
# construct a Logger directly to bypass getLogger's caching
self.logger = logging.Logger('LogFormatterTest')
self.logger.propagate = False
self.tempdir = tempfile.mkdtemp()
self.filename = os.path.join(self.tempdir, 'log.out')
self.handler = self.make_handler(self.filename)
self.handler.setFormatter(self.formatter)
self.logger.addHandler(self.handler)

def tearDown(self):
os.unlink(self.filename)
os.rmdir(self.tempdir)

def make_handler(self, filename):
# Base case: default setup without explicit encoding.
# In python 2, supports arbitrary byte strings and unicode objects
# that contain only ascii. In python 3, supports ascii-only unicode
# strings (but byte strings will be repr'd automatically.
return logging.FileHandler(filename)

def get_output(self):
with open(self.filename, "rb") as f:
line = f.read().strip()
m = LogFormatterTest.LINE_RE.match(line)
if m:
return m.group(1)
else:
raise Exception("output didn't match regex: %r" % line)

def test_basic_logging(self):
self.logger.error("foo")
self.assertEqual(self.get_output(), b("foo"))

def test_bytes_logging(self):
self.logger.error(b("\xe9"))
# This will be "\xe9" on python 2 or "b'\xe9'" on python 3
self.assertEqual(self.get_output(), utf8(repr(b("\xe9"))))

def test_utf8_logging(self):
self.logger.error(u"\u00e9".encode("utf8"))
if issubclass(bytes_type, basestring):
# on python 2, utf8 byte strings (and by extension ascii byte
# strings) are passed through as-is.
self.assertEqual(self.get_output(), utf8(u"\u00e9"))
else:
# on python 3, byte strings always get repr'd even if
# they're ascii-only, so this degenerates into another
# copy of test_bytes_logging.
self.assertEqual(self.get_output(), utf8(repr(utf8(u"\u00e9"))))


class UnicodeLogFormatterTest(LogFormatterTest):
def make_handler(self, filename):
# Adding an explicit encoding configuration allows non-ascii unicode
# strings in both python 2 and 3, without changing the behavior
# for byte strings.
return logging.FileHandler(filename, encoding="utf8")

def test_unicode_logging(self):
self.logger.error(u"\u00e9")
self.assertEqual(self.get_output(), utf8(u"\u00e9"))

0 comments on commit 83b3882

Please sign in to comment.