Skip to content

Commit

Permalink
Fix slow backtracking when parsing strings (no external deps) (eliben…
Browse files Browse the repository at this point in the history
…#347)

* Fix slow backtracking when parsing strings (no external deps)

Fixes eliben#61

This uses negative lookaheads to avoid ambiguity in how string should be
parsed by the regex.
- https://docs.python.org/2/library/re.html#regular-expression-syntax

- Previously, if it didn't immediately succeed at parsing an escape
  sequence such as `\123`, it would have to try `\1`+`23`, `\12` + `3`,
  and `\123`, which multiplied the time taken by 3 per additional escape
  sequence. This solves that by only allowing `\123`
- The same fix was added for hex escapes.

Also fix a test that relied on the incorrect handling of regexes.
The implementation documentation says that it intends to allow
**decimal** escapes permissively.

* WIP debug

* Fix ambiguity caused by allowing #path directives

Solve this by allowing "\x" when not followed by hex,
in the regular string literal.

In the previous commits,
`\x12` could be parsed both as `\x`+`12` and `\x12`,
which caused exponential options for backtracking.

* Document changes to lexer, remove debug code

* Optimize this for strings
  • Loading branch information
TysonAndre authored and eliben committed Aug 26, 2019
1 parent 5d5904d commit 62ee4ba
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 8 deletions.
39 changes: 32 additions & 7 deletions pycparser/c_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class CLexer(object):
tokens.
The public attribute filename can be set to an initial
filaneme, but the lexer will update it upon #line
filename, but the lexer will update it upon #line
directives.
"""
def __init__(self, error_func, on_lbrace_func, on_rbrace_func,
Expand Down Expand Up @@ -205,23 +205,48 @@ def _make_tok_location(self, token):
# parse all correct code, even if it means to sometimes parse incorrect
# code.
#
simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
decimal_escape = r"""(\d+)"""
hex_escape = r"""(x[0-9a-fA-F]+)"""
bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
# The original regexes were taken verbatim from the C syntax definition,
# and were later modified to avoid worst-case exponential running time.
#
# simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
# decimal_escape = r"""(\d+)"""
# hex_escape = r"""(x[0-9a-fA-F]+)"""
# bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""
#
# The following modifications were made to avoid the ambiguity that allowed backtracking:
# (https://github.com/eliben/pycparser/issues/61)
#
# - \x was removed from simple_escape, unless it was not followed by a hex digit, to avoid ambiguity with hex_escape.
# - hex_escape allows one or more hex characters, but requires that the next character(if any) is not hex
# - decimal_escape allows one or more decimal characters, but requires that the next character(if any) is not a decimal
# - bad_escape does not allow any decimals (8-9), to avoid conflicting with the permissive decimal_escape.
#
# Without this change, python's `re` module would recursively try parsing each ambiguous escape sequence in multiple ways.
# e.g. `\123` could be parsed as `\1`+`23`, `\12`+`3`, and `\123`.

simple_escape = r"""([a-wyzA-Z._~!=&\^\-\\?'"]|x(?![0-9a-fA-F]))"""
decimal_escape = r"""(\d+)(?!\d)"""
hex_escape = r"""(x[0-9a-fA-F]+)(?![0-9a-fA-F])"""
bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-9])"""

escape_sequence = r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

# This complicated regex with lookahead might be slow for strings, so because all of the valid escapes (including \x) allowed
# 0 or more non-escaped characters after the first character, simple_escape+decimal_escape+hex_escape got simplified to

escape_sequence_start_in_string = r"""(\\[0-9a-zA-Z._~!=&\^\-\\?'"])"""

cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
char_const = "'"+cconst_char+"'"
wchar_const = 'L'+char_const
unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
bad_char_const = r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+bad_escape+r"""[^'\n]*')"""

# string literals (K&R2: A.2.6)
string_char = r"""([^"\\\n]|"""+escape_sequence+')'
string_char = r"""([^"\\\n]|"""+escape_sequence_start_in_string+')'
string_literal = '"'+string_char+'*"'
wstring_literal = 'L'+string_literal
bad_string_literal = '"'+string_char+'*?'+bad_escape+string_char+'*"'
bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

# floating constants (K&R2: A.2.5.3)
exponent_part = r"""([eE][-+]?[0-9]+)"""
Expand Down
31 changes: 30 additions & 1 deletion tests/test_c_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def test_char_constants(self):
self.assertTokensTypes(r"""'\t'""", ['CHAR_CONST'])
self.assertTokensTypes(r"""'\''""", ['CHAR_CONST'])
self.assertTokensTypes(r"""'\?'""", ['CHAR_CONST'])
self.assertTokensTypes(r"""'\0'""", ['CHAR_CONST'])
self.assertTokensTypes(r"""'\012'""", ['CHAR_CONST'])
self.assertTokensTypes(r"""'\x2f'""", ['CHAR_CONST'])
self.assertTokensTypes(r"""'\x2f12'""", ['CHAR_CONST'])
Expand Down Expand Up @@ -149,6 +150,24 @@ def test_string_literal(self):
self.assertTokensTypes(
'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"',
['STRING_LITERAL'])
# Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
# directives with Windows paths as filenames (..\..\dir\file)
self.assertTokensTypes(
r'"\x"',
['STRING_LITERAL'])
self.assertTokensTypes(
r'"\a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z\A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z"',
['STRING_LITERAL'])
self.assertTokensTypes(
r'"C:\x\fa\x1e\xited"',
['STRING_LITERAL'])
# The lexer is permissive and allows decimal escapes (not just octal)
self.assertTokensTypes(
'"jx\9"',
['STRING_LITERAL'])
self.assertTokensTypes(
'"fo\9999999"',
['STRING_LITERAL'])

def test_mess(self):
self.assertTokensTypes(
Expand Down Expand Up @@ -428,14 +447,24 @@ def test_integer_constants(self):
def test_char_constants(self):
self.assertLexerError("'", ERR_UNMATCHED_QUOTE)
self.assertLexerError("'b\n", ERR_UNMATCHED_QUOTE)
self.assertLexerError("'\\xaa\n'", ERR_UNMATCHED_QUOTE)

self.assertLexerError(r"'\12a'", ERR_INVALID_CCONST)
self.assertLexerError(r"'\xabg'", ERR_INVALID_CCONST)
self.assertLexerError("''", ERR_INVALID_CCONST)
self.assertLexerError("'jx'", ERR_INVALID_CCONST)
self.assertLexerError(r"'\*'", ERR_INVALID_CCONST)

def test_string_literals(self):
self.assertLexerError(r'"jx\9"', ERR_STRING_ESCAPE)
self.assertLexerError(r'"jx\`"', ERR_STRING_ESCAPE)
self.assertLexerError(r'"hekllo\* on ix"', ERR_STRING_ESCAPE)
self.assertLexerError(r'L"hekllo\* on ix"', ERR_STRING_ESCAPE)
# Should not suffer from slow backtracking
self.assertLexerError(r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"', ERR_STRING_ESCAPE)
self.assertLexerError(r'"\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\x23\`\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23"', ERR_STRING_ESCAPE)
# Should not suffer from slow backtracking when there's no end quote
self.assertLexerError(r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\12\123456', ERR_ILLEGAL_CHAR)
self.assertLexerError(r'"\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\`\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x2\x23456', ERR_ILLEGAL_CHAR)

def test_preprocessor(self):
self.assertLexerError('#line "ka"', ERR_FILENAME_BEFORE_LINE)
Expand Down

0 comments on commit 62ee4ba

Please sign in to comment.