Try to autodetect encoding for binary strings while parsing.

Deal with UTF-8 BOM
forging2012 · May 14, 2017 · cea880b · cea880b
1 parent 70d2d11
commit cea880b
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 8 deletions.
diff --git a/gixy/cli/main.py b/gixy/cli/main.py
@@ -151,10 +151,10 @@ def main():
 
     with Gixy(config=config) as yoda:
         if path == '-':
-            with os.fdopen(sys.stdin.fileno(), 'r') as fdata:
+            with os.fdopen(sys.stdin.fileno(), 'rb') as fdata:
                 yoda.audit('<stdin>', fdata, is_stdin=True)
         else:
-            with open(path, mode='r') as fdata:
+            with open(path, mode='rb') as fdata:
                 yoda.audit(path, fdata, is_stdin=False)
 
         formatted = formatters()[config.output_format]().format(yoda)

diff --git a/gixy/parser/nginx_parser.py b/gixy/parser/nginx_parser.py
@@ -23,7 +23,7 @@ def __init__(self, cwd='', allow_includes=True):
 
     def parse_file(self, path, root=None):
         LOG.debug("Parse file: {}".format(path))
-        content = open(path).read()
+        content = open(path, mode='rb').read()
         return self.parse(content=content, root=root, path_info=path)
 
     def parse(self, content, root=None, path_info=None):

diff --git a/gixy/parser/raw_parser.py b/gixy/parser/raw_parser.py
@@ -1,4 +1,6 @@
 import logging
+import codecs
+import six
 from cached_property import cached_property
 
 from pyparsing import (
@@ -27,11 +29,19 @@ def parse(self, data):
         """
         Returns the parsed tree.
         """
-        content = data.strip()
+        if isinstance(data, six.binary_type):
+            if data[:3] == codecs.BOM_UTF8:
+                encoding = 'utf-8-sig'
+            else:
+                encoding = 'latin1'
+            content = data.decode(encoding).strip()
+        else:
+            content = data.strip()
+
         if not content:
             return ParseResults()
 
-        return self.script.parseString(data, parseAll=True)
+        return self.script.parseString(content, parseAll=True)
 
     @cached_property
     def script(self):

diff --git a/tests/parser/test_raw_parser.py b/tests/parser/test_raw_parser.py
@@ -1,7 +1,4 @@
 from nose.tools import assert_equals
-import mock
-from six import StringIO
-from six.moves import builtins
 from gixy.parser.raw_parser import *
 
 
@@ -527,6 +524,28 @@ def test_empty_config():
     assert_config(config, expected)
 
 
+def test_utfbom_decoding():
+    config = b'''\xef\xbb\xbf
+add_header X-Test "Windows-1251";
+        '''
+
+    expected = [
+        ['add_header', 'X-Test', 'Windows-1251']
+    ]
+
+    assert_config(config, expected)
+
+
+def test_national_comment_decoding():
+    config = b'''
+# \xeb\xff-\xeb\xff-\xeb\xff = Lya-lya-lya
+add_header X-Test "Windows-1251";
+        '''
+
+    actual = RawParser().parse(config)
+    assert_equals(len(actual.asList()), 2)
+
+
 def assert_config(config, expected):
     actual = RawParser().parse(config)
     assert_equals(actual.asList(), expected)