Merge pull request c2nes#51 from cassianomonteiro/master
Option to ignore errors on tokenization.
c2nes authored Apr 22, 2018
2 parents a6f23dd + bc71746 commit 5838927
Showing 2 changed files with 109 additions and 16 deletions.
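
For context, a minimal usage sketch of the new option (a sketch only, not part of the commit; it assumes the package is importable as javalang and borrows its sample input from the tests below):

from javalang import tokenizer

# A bare line of Javadoc: outside a comment, '#' is not a valid Java token.
code = " * See {@link BlockTokenSecretManager#setKeys(ExportedBlockKeys)}"

# Default behaviour is unchanged: the first unrecognised token raises LexerError.
try:
    list(tokenizer.tokenize(code))
except tokenizer.LexerError as e:
    print("raised:", e)

# With ignore_errors=True tokenization keeps going; errors are collected on the
# JavaTokenizer instance instead of being raised.
jt = tokenizer.JavaTokenizer(code, ignore_errors=True)
tokens = list(jt.tokenize())
print(len(tokens), "tokens,", len(jt.errors), "errors recorded")
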
89 changes: 87 additions & 2 deletions javalang/test/test_tokenizer.py
@@ -4,8 +4,7 @@

 class TestTokenizer(unittest.TestCase):
 
-    def test_tokenizer(self):
-
+    def test_tokenizer_annotation(self):
         # Given
         code = " @Override"

@@ -18,3 +17,89 @@ def test_tokenizer(self):
         self.assertEqual(tokens[1].value, "Override")
         self.assertEqual(type(tokens[0]), tokenizer.Annotation)
         self.assertEqual(type(tokens[1]), tokenizer.Identifier)
+
+    def test_tokenizer_javadoc(self):
+        # Given
+        code = "/**\n" \
+               " * See {@link BlockTokenSecretManager#setKeys(ExportedBlockKeys)}\n" \
+               " */"
+
+        # When
+        tokens = list(tokenizer.tokenize(code))
+
+        # Then
+        self.assertEqual(len(tokens), 0)
+
+    def test_tokenize_ignore_errors(self):
+        # Given
+        # character '#' was supposed to trigger an error of unknown token with a single line of javadoc
+        code = " * See {@link BlockTokenSecretManager#setKeys(ExportedBlockKeys)}"
+
+        # When
+        tokens = list(tokenizer.tokenize(code, ignore_errors=True))
+
+        # Then
+        self.assertEqual(len(tokens), 11)
+
+    def test_tokenize_comment_line_with_period(self):
+        # Given
+        code = " * all of the servlets resistant to cross-site scripting attacks."
+
+        # When
+        tokens = list(tokenizer.tokenize(code))
+
+        # Then
+        self.assertEqual(len(tokens), 13)
+
+    def test_tokenize_integer_at_end(self):
+        # Given
+        code = "nextKey = new BlockKey(serialNo, System.currentTimeMillis() + 3"
+
+        # When
+        tokens = list(tokenizer.tokenize(code, ignore_errors=True))
+
+        # Then
+        self.assertEqual(len(tokens), 14)
+
+    def test_tokenize_float_at_end(self):
+        # Given
+        code = "nextKey = new BlockKey(serialNo, System.currentTimeMillis() + 3.0"
+
+        # When
+        tokens = list(tokenizer.tokenize(code, ignore_errors=True))
+
+        # Then
+        self.assertEqual(len(tokens), 14)
+
+    def test_tokenize_hex_integer_at_end(self):
+        # Given
+        code = "nextKey = new BlockKey(serialNo, System.currentTimeMillis() + 0x3"
+
+        # When
+        tokens = list(tokenizer.tokenize(code, ignore_errors=True))
+
+        # Then
+        self.assertEqual(len(tokens), 14)
+
+    def test_tokenize_hex_float_integer_at_end(self):
+        # Given
+        code = "nextKey = new BlockKey(serialNo, System.currentTimeMillis() + 0x3.2p2"
+
+        # When
+        tokens = list(tokenizer.tokenize(code, ignore_errors=True))
+
+        # Then
+        self.assertEqual(len(tokens), 14)
+
+    def test_string_delim_within_comment(self):
+
+        # Given
+        code = "* Returns 0 if it can't find the end \
+        if (*itr == '\r') { \
+        int status;"
+
+        # When
+        tokens = list(tokenizer.tokenize(code, ignore_errors=True))
+
+        # Then
+        self.assertEqual(len(tokens), 8)
36 changes: 22 additions & 14 deletions javalang/tokenizer.py
@@ -146,8 +146,10 @@ class JavaTokenizer(object):

     IDENT_PART_CATEGORIES = set(['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mc', 'Mn', 'Nd', 'Nl', 'Pc', 'Sc'])
 
-    def __init__(self, data):
+    def __init__(self, data, ignore_errors=False):
         self.data = data
+        self.ignore_errors = ignore_errors
+        self.errors = []
 
         self.current_line = 1
         self.start_of_line = 0
@@ -193,6 +195,7 @@ def read_string(self):
         while True:
             if j >= length:
                 self.error('Unterminated character/string literal')
+                break
 
             if state == 0:
                 if self.data[j] == '\\':
@@ -289,23 +292,23 @@ def read_decimal_float_or_integer(self):

         self.read_decimal_integer()
 
-        if self.data[self.j] not in '.eEfFdD':
+        if self.j >= len(self.data) or self.data[self.j] not in '.eEfFdD':
             return DecimalInteger
 
         if self.data[self.j] == '.':
             self.i = self.j + 1
             self.read_decimal_integer()
 
-        if self.data[self.j] in 'eE':
+        if self.j < len(self.data) and self.data[self.j] in 'eE':
             self.j = self.j + 1
 
-            if self.data[self.j] in '-+':
+            if self.j < len(self.data) and self.data[self.j] in '-+':
                 self.j = self.j + 1
 
             self.i = self.j
             self.read_decimal_integer()
 
-        if self.data[self.j] in 'fFdD':
+        if self.j < len(self.data) and self.data[self.j] in 'fFdD':
             self.j = self.j + 1
 
         self.i = orig_i
@@ -317,25 +320,25 @@ def read_hex_integer_or_float(self):

         self.read_hex_integer()
 
-        if self.data[self.j] not in '.pP':
+        if self.j >= len(self.data) or self.data[self.j] not in '.pP':
             return HexInteger
 
         if self.data[self.j] == '.':
             self.j = self.j + 1
             self.read_digits('0123456789abcdefABCDEF')
 
-        if self.data[self.j] in 'pP':
+        if self.j < len(self.data) and self.data[self.j] in 'pP':
             self.j = self.j + 1
         else:
             self.error('Invalid hex float literal')
 
-        if self.data[self.j] in '-+':
+        if self.j < len(self.data) and self.data[self.j] in '-+':
             self.j = self.j + 1
 
         self.i = self.j
         self.read_decimal_integer()
 
-        if self.data[self.j] in 'fFdD':
+        if self.j < len(self.data) and self.data[self.j] in 'fFdD':
             self.j = self.j + 1
 
         self.i = orig_i
@@ -345,7 +348,7 @@ def read_digits(self, digits):
         tmp_i = 0
         c = None
 
-        while True:
+        while self.j + tmp_i < len(self.data):
             c = self.data[self.j + tmp_i]
 
             if c in digits:
@@ -535,7 +538,7 @@ def tokenize(self):
                 token_type = Annotation
                 self.j = self.i + 1
 
-            elif c == '.' and c_next.isdigit():
+            elif c == '.' and c_next and c_next.isdigit():
                 token_type = self.read_decimal_float_or_integer()
 
             elif self.try_separator():
@@ -556,6 +559,8 @@

             else:
                 self.error('Could not process token', c)
+                self.i = self.i + 1
+                continue
 
             position = (self.current_line, self.i - self.start_of_line)
             token = token_type(self.data[self.i:self.j], position, self.javadoc)
@@ -578,11 +583,14 @@ def error(self, message, char=None):
             char = self.data[self.j]
 
         message = u'%s at "%s", line %s: %s' % (message, char, line_number, line)
+        error = LexerError(message)
+        self.errors.append(error)
 
-        raise LexerError(message)
+        if not self.ignore_errors:
+            raise error
 
-def tokenize(code):
-    tokenizer = JavaTokenizer(code)
+def tokenize(code, ignore_errors=False):
+    tokenizer = JavaTokenizer(code, ignore_errors)
     return tokenizer.tokenize()
 
 def reformat_tokens(tokens):
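
Most of the tokenizer.py hunks above add the same end-of-input guard (checking self.j against len(self.data)) before indexing, so numeric literals that stop exactly at the end of the input no longer step past the buffer. A quick sketch of the effect (a sketch only, assuming this branch is importable; the expressions are illustrative):

from javalang import tokenizer

# Each literal sits at the very end of the input, the case the new bounds checks target.
for code in ("a + 3", "a + 3.0", "a + 0x3", "a + 0x3.2p2"):
    tokens = list(tokenizer.tokenize(code, ignore_errors=True))
    print(code, "->", [t.value for t in tokens])
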
