Various work in progress. Position tracking
dabeaz committed Sep 7, 2022
1 parent cd9014e commit 62203d8
Showing 4 changed files with 79 additions and 73 deletions.
14 changes: 13 additions & 1 deletion CHANGES
@@ -1,6 +1,18 @@
In Progress
-----------

03/25/2022 Added automatic location tracking to the parser. Use
           Parser.line_position(value) to return the line number
           and Parser.index_position(value) to return a (start, end)
           index pair. value is *any* object returned by one of
           the various methods in the parser definition. Typically,
           it would be an AST node. The parser tracks the data using
           the value of id(value).
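
           A minimal usage sketch (illustrative only; the grammar and
           token names below are not part of this commit):

               from sly import Lexer, Parser

               class CalcLexer(Lexer):
                   tokens = { 'NUMBER', 'PLUS' }
                   ignore = ' \t'
                   NUMBER = r'\d+'
                   PLUS = r'\+'

               class CalcParser(Parser):
                   tokens = CalcLexer.tokens

                   @_('expr PLUS NUMBER')
                   def expr(self, p):
                       return ('add', p.expr, int(p.NUMBER))

                   @_('NUMBER')
                   def expr(self, p):
                       return ('num', int(p.NUMBER))

               parser = CalcParser()
               tree = parser.parse(CalcLexer().tokenize('2 + 3 + 4'))
               parser.line_position(tree)     # -> 1
               parser.index_position(tree)    # -> (0, 9)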

03/25/2022 Added a .end attribute to tokens that specifies the ending
           index of the matching text. This is used to do more
           precise location tracking for the purpose of issuing
           more useful error messages.
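
           A quick sketch of what this enables (the lexer below is
           illustrative, not part of this commit):

               from sly import Lexer

               class WordLexer(Lexer):
                   tokens = { 'WORD' }
                   ignore = ' '
                   WORD = r'\w+'

               text = 'foo bar'
               for tok in WordLexer().tokenize(text):
                   print(tok.type, tok.index, tok.end, text[tok.index:tok.end])
               # WORD 0 3 foo
               # WORD 4 7 bar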

05/09/2020 Experimental support for EBNF choices. For example:

           @('term { PLUS|MINUS term }')
8 changes: 5 additions & 3 deletions sly/lex.py
@@ -73,9 +73,9 @@ class Token(object):
    '''
    Representation of a single token.
    '''
    __slots__ = ('type', 'value', 'lineno', 'index')
    __slots__ = ('type', 'value', 'lineno', 'index', 'end')
    def __repr__(self):
        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'

class TokenStr(str):
    @staticmethod
@@ -406,7 +406,7 @@ def _reject():
                tok.index = index
                m = _master_re.match(text, index)
                if m:
                    index = m.end()
                    tok.end = index = m.end()
                    tok.value = m.group()
                    tok.type = m.lastgroup

@@ -431,6 +431,7 @@ def _reject():
                    # No match, see if the character is in literals
                    if text[index] in _literals:
                        tok.value = text[index]
                        tok.end = index + 1
                        tok.type = tok.value
                        index += 1
                        yield tok
@@ -442,6 +443,7 @@ def _reject():
                        tok.value = text[index:]
                        tok = self.error(tok)
                        if tok is not None:
                            tok.end = self.index
                            yield tok

                        index = self.index
58 changes: 46 additions & 12 deletions sly/yacc.py
@@ -126,8 +126,6 @@ def __len__(self):
    @property
    def lineno(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            lineno = getattr(tok, 'lineno', None)
            if lineno:
                return lineno
@@ -136,13 +134,20 @@ def lineno(self):
    @property
    def index(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            index = getattr(tok, 'index', None)
            if index is not None:
                return index
        raise AttributeError('No index attribute found')

    @property
    def end(self):
        result = None
        for tok in self._slice:
            r = getattr(tok, 'end', None)
            if r:
                result = r
        return result

    def __getattr__(self, name):
        if name in self._namemap:
            return self._namemap[name](self._slice)
@@ -1806,12 +1811,6 @@ class ParserMeta(type):
    @classmethod
    def __prepare__(meta, *args, **kwargs):
        d = ParserMetaDict()
        # def _(rule, *extra):
        #     rules = [rule, *extra]
        #     def decorate(func):
        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        #         return func
        #     return decorate
        d['_'] = _decorator
        return d

@@ -1822,6 +1821,9 @@ def __new__(meta, clsname, bases, attributes):
        return cls

class Parser(metaclass=ParserMeta):
    # Automatic tracking of position information
    track_positions = True

    # Logging object where debugging/diagnostic messages are sent
    log = SlyLogger(sys.stderr)

@@ -2076,9 +2078,15 @@ def parse(self, tokens):
        self.tokens = tokens
        self.statestack = statestack = []    # Stack of parsing states
        self.symstack = symstack = []        # Stack of grammar symbols
        pslice._stack = symstack             # Associate the stack with the production
        self.restart()

        # Set up position tracking
        track_positions = self.track_positions
        if not hasattr(self, '_line_positions'):
            self._line_positions = { }       # id: -> lineno
            self._index_positions = { }      # id: -> (start, end)

        errtoken = None                      # Err token
        while True:
            # Get the next symbol on the input. If a lookahead symbol
@@ -2093,7 +2101,7 @@ def parse(self, tokens):
                    if not lookahead:
                        lookahead = YaccSymbol()
                        lookahead.type = '$end'

                # Check the action table
                ltype = lookahead.type
                t = actions[self.state].get(ltype)
@@ -2129,7 +2137,23 @@ def parse(self, tokens):
                    value = p.func(self, pslice)
                    if value is pslice:
                        value = (pname, *(s.value for s in pslice._slice))

                    sym.value = value

                    # Record positions
                    if track_positions:
                        if plen:
                            sym.lineno = symstack[-plen].lineno
                            sym.index = symstack[-plen].index
                            sym.end = symstack[-1].end
                        else:
                            # A zero-length production (what to put here?)
                            sym.lineno = None
                            sym.index = None
                            sym.end = None
                        self._line_positions[id(value)] = sym.lineno
                        self._index_positions[id(value)] = (sym.index, sym.end)

                    if plen:
                        del symstack[-plen:]
                        del statestack[-plen:]
@@ -2214,6 +2238,8 @@ def parse(self, tokens):
                    t.lineno = lookahead.lineno
                if hasattr(lookahead, 'index'):
                    t.index = lookahead.index
                if hasattr(lookahead, 'end'):
                    t.end = lookahead.end
                t.value = lookahead
                lookaheadstack.append(lookahead)
                lookahead = t
@@ -2225,3 +2251,11 @@ def parse(self, tokens):

            # Call an error function here
            raise RuntimeError('sly: internal parser error!!!\n')

    # Return position tracking information
    def line_position(self, value):
        return self._line_positions[id(value)]

    def index_position(self, value):
        return self._index_positions[id(value)]

72 changes: 15 additions & 57 deletions tests/test_lex.py
@@ -1,11 +1,6 @@
import pytest
from sly import Lexer

try:
    import regex
except ImportError:
    regex = None

class CalcLexer(Lexer):
    # Set of token names. This is always required
    tokens = {
@@ -61,29 +56,6 @@ def error(self, t):
    def __init__(self):
        self.errors = []

if regex is not None:
    class RegexModuleCalcLexer(Lexer):
        regex_module = regex

        tokens = { 'ID', 'PLUS', 'MINUS' }

        literals = { '(', ')' }
        ignore = ' \t'

        ID = r'\p{Ll}+'    # Unicode lowercase letters, regex module feature
        PLUS = r'\+'
        MINUS = r'-'

        ignore_comment = r'\#.*'

        @_(r'\n+')
        def newline(self, t):
            self.lineno += t.value.count('\n')

        def ID(self, t):
            t.value = t.value.upper()
            return t

# Test basic recognition of various tokens and literals
def test_tokens():
    lexer = CalcLexer()
@@ -93,17 +65,21 @@ def test_tokens():
    assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
    assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']

# Test third-party regex module support
@pytest.mark.skipif(regex is None,
                    reason="third-party regex module not installed")
def test_3rd_party_regex_module():
    lexer = RegexModuleCalcLexer()
    toks = list(lexer.tokenize('a + b - c'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    assert types == ['ID','PLUS','ID','MINUS','ID']
    assert vals == ['A', '+', 'B', '-', 'C']

# Test position tracking
def test_positions():
    lexer = CalcLexer()
    text = 'abc\n( )'
    toks = list(lexer.tokenize(text))
    lines = [t.lineno for t in toks]
    indices = [t.index for t in toks]
    ends = [t.end for t in toks]
    values = [text[t.index:t.end] for t in toks]
    assert values == ['abc', '(', ')']
    assert lines == [1, 2, 2]
    assert indices == [0, 4, 6]
    assert ends == [3, 5, 7]


# Test ignored comments and newlines
def test_ignored():
    lexer = CalcLexer()
@@ -228,23 +204,5 @@ def test_modern_error_return():
    assert vals == [123, ':+-', '+', '-']
    assert lexer.errors == [ ':+-' ]

# Test Lexer Inheritance. This class should inherit all of the tokens
# and features of ModernCalcLexer, but add two new tokens to it. The
# PLUSPLUS token matches before the PLUS token.

if False:
    class SubModernCalcLexer(ModernCalcLexer):
        tokens |= { DOLLAR, PLUSPLUS }
        DOLLAR = r'\$'
        PLUSPLUS = r'\+\+'
        PLUSPLUS.before = PLUS

    def test_lexer_inherit():
        lexer = SubModernCalcLexer()
        toks = list(lexer.tokenize('123 + - $ ++ if'))
        types = [t.type for t in toks]
        vals = [t.value for t in toks]
        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
        assert vals == [123, '+', '-', '$', '++', 'if']

