Skip to content

Commit

Permalink
Try to autodetect encoding for binary string while parsing.
Browse files Browse the repository at this point in the history
Deal with UTF-8 BOM
  • Loading branch information
buglloc committed May 14, 2017
1 parent 70d2d11 commit cea880b
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 8 deletions.
4 changes: 2 additions & 2 deletions gixy/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,10 @@ def main():

with Gixy(config=config) as yoda:
if path == '-':
with os.fdopen(sys.stdin.fileno(), 'r') as fdata:
with os.fdopen(sys.stdin.fileno(), 'rb') as fdata:
yoda.audit('<stdin>', fdata, is_stdin=True)
else:
with open(path, mode='r') as fdata:
with open(path, mode='rb') as fdata:
yoda.audit(path, fdata, is_stdin=False)

formatted = formatters()[config.output_format]().format(yoda)
Expand Down
2 changes: 1 addition & 1 deletion gixy/parser/nginx_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, cwd='', allow_includes=True):

def parse_file(self, path, root=None):
    """Read the file at *path* as raw bytes and parse it.

    The bytes are handed to ``self.parse`` undecoded, together with *root*
    and with *path* as ``path_info``; no decoding happens in this method.

    :param path: location of the file to read.
    :param root: forwarded unchanged to ``self.parse``.
    :return: whatever ``self.parse`` returns for the file content.
    """
    LOG.debug("Parse file: {}".format(path))
    # Read once, in binary mode. A preliminary text-mode read would be
    # thrown away and could fail on content that is not valid in the
    # platform's default encoding. The context manager closes the handle
    # even if the read raises.
    with open(path, mode='rb') as fdata:
        content = fdata.read()
    return self.parse(content=content, root=root, path_info=path)

def parse(self, content, root=None, path_info=None):
Expand Down
14 changes: 12 additions & 2 deletions gixy/parser/raw_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
import codecs
import six
from cached_property import cached_property

from pyparsing import (
Expand Down Expand Up @@ -27,11 +29,19 @@ def parse(self, data):
"""
Returns the parsed tree.
"""
content = data.strip()
if isinstance(data, six.binary_type):
if data[:3] == codecs.BOM_UTF8:
encoding = 'utf-8-sig'
else:
encoding = 'latin1'
content = data.decode(encoding).strip()
else:
content = data.strip()

if not content:
return ParseResults()

return self.script.parseString(data, parseAll=True)
return self.script.parseString(content, parseAll=True)

@cached_property
def script(self):
Expand Down
25 changes: 22 additions & 3 deletions tests/parser/test_raw_parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from nose.tools import assert_equals
import mock
from six import StringIO
from six.moves import builtins
from gixy.parser.raw_parser import *


Expand Down Expand Up @@ -527,6 +524,28 @@ def test_empty_config():
assert_config(config, expected)


def test_utfbom_decoding():
    """Config bytes that start with a UTF-8 BOM parse to the bare directive."""
    # The payload opens with the three-byte UTF-8 byte order mark.
    bom_config = b'''\xef\xbb\xbf
add_header X-Test "Windows-1251";
'''

    assert_config(bom_config, [
        ['add_header', 'X-Test', 'Windows-1251']
    ])


def test_national_comment_decoding():
    """A comment containing non-ASCII bytes must not break parsing."""
    # The comment bytes (\xeb\xff) are not valid UTF-8.
    raw_config = b'''
# \xeb\xff-\xeb\xff-\xeb\xff = Lya-lya-lya
add_header X-Test "Windows-1251";
'''

    entries = RawParser().parse(raw_config).asList()
    # Two top-level entries are expected -- presumably the comment and the
    # add_header directive; confirm against the parser grammar.
    assert_equals(len(entries), 2)


def assert_config(config, expected):
    """Parse *config* with a fresh RawParser and compare its list form to *expected*."""
    assert_equals(RawParser().parse(config).asList(), expected)

0 comments on commit cea880b

Please sign in to comment.