Various work in progress. Position tracking
dabeaz committed Sep 7, 2022
1 parent cd9014e commit 62203d8
Showing 4 changed files with 79 additions and 73 deletions.
14 changes: 13 additions & 1 deletion CHANGES
@@ -1,6 +1,18 @@
In Progress
-----------

03/25/2022 Added automatic location tracking to the parser. Use
           Parser.line_position(value) to return the line number
           and Parser.index_position(value) to return a (start, end)
           index pair. value is *any* object returned by one of
           the various methods in the parser definition. Typically,
           it would be an AST node. The parser tracks the data using
           the value of id(value).
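
           A minimal usage sketch (illustrative only; the grammar and
           token names below are not part of this commit):

               from sly import Lexer, Parser

               class CalcLexer(Lexer):
                   tokens = { 'NUMBER', 'PLUS' }
                   ignore = ' \t'
                   NUMBER = r'\d+'
                   PLUS = r'\+'

               class CalcParser(Parser):
                   tokens = CalcLexer.tokens

                   @_('expr PLUS NUMBER')
                   def expr(self, p):
                       return ('add', p.expr, int(p.NUMBER))

                   @_('NUMBER')
                   def expr(self, p):
                       return ('num', int(p.NUMBER))

               parser = CalcParser()
               tree = parser.parse(CalcLexer().tokenize('2 + 3 + 4'))
               parser.line_position(tree)     # -> 1
               parser.index_position(tree)    # -> (0, 9)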

03/25/2022 Added a .end attribute to tokens that specifies the ending
           index of the matching text. This is used to do more
           precise location tracking for the purpose of issuing
           more useful error messages.
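
           A quick sketch of what this enables (the lexer below is
           illustrative, not part of this commit):

               from sly import Lexer

               class WordLexer(Lexer):
                   tokens = { 'WORD' }
                   ignore = ' '
                   WORD = r'\w+'

               text = 'foo bar'
               for tok in WordLexer().tokenize(text):
                   print(tok.type, tok.index, tok.end, text[tok.index:tok.end])
               # WORD 0 3 foo
               # WORD 4 7 bar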

05/09/2020 Experimental support for EBNF choices. For example:

           @('term { PLUS|MINUS term }')
8 changes: 5 additions & 3 deletions sly/lex.py
@@ -73,9 +73,9 @@ class Token(object):
    '''
    Representation of a single token.
    '''
    __slots__ = ('type', 'value', 'lineno', 'index')
    __slots__ = ('type', 'value', 'lineno', 'index', 'end')
    def __repr__(self):
        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index}, end={self.end})'

class TokenStr(str):
    @staticmethod
@@ -406,7 +406,7 @@ def _reject():
                tok.index = index
                m = _master_re.match(text, index)
                if m:
                    index = m.end()
                    tok.end = index = m.end()
                    tok.value = m.group()
                    tok.type = m.lastgroup

@@ -431,6 +431,7 @@ def _reject():
                    # No match, see if the character is in literals
                    if text[index] in _literals:
                        tok.value = text[index]
                        tok.end = index + 1
                        tok.type = tok.value
                        index += 1
                        yield tok
@@ -442,6 +443,7 @@ def _reject():
                        tok.value = text[index:]
                        tok = self.error(tok)
                        if tok is not None:
                            tok.end = self.index
                            yield tok

                        index = self.index
58 changes: 46 additions & 12 deletions sly/yacc.py
@@ -126,8 +126,6 @@ def __len__(self):
    @property
    def lineno(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            lineno = getattr(tok, 'lineno', None)
            if lineno:
                return lineno
@@ -136,13 +134,20 @@ def lineno(self):
    @property
    def index(self):
        for tok in self._slice:
            if isinstance(tok, YaccSymbol):
                continue
            index = getattr(tok, 'index', None)
            if index is not None:
                return index
        raise AttributeError('No index attribute found')

    @property
    def end(self):
        result = None
        for tok in self._slice:
            r = getattr(tok, 'end', None)
            if r:
                result = r
        return result

    def __getattr__(self, name):
        if name in self._namemap:
            return self._namemap[name](self._slice)
@@ -1806,12 +1811,6 @@ class ParserMeta(type):
    @classmethod
    def __prepare__(meta, *args, **kwargs):
        d = ParserMetaDict()
        # def _(rule, *extra):
        #     rules = [rule, *extra]
        #     def decorate(func):
        #         func.rules = [ *getattr(func, 'rules', []), *rules[::-1] ]
        #         return func
        #     return decorate
        d['_'] = _decorator
        return d

@@ -1822,6 +1821,9 @@ def __new__(meta, clsname, bases, attributes):
        return cls

class Parser(metaclass=ParserMeta):
    # Automatic tracking of position information
    track_positions = True

    # Logging object where debugging/diagnostic messages are sent
    log = SlyLogger(sys.stderr)

@@ -2076,9 +2078,15 @@ def parse(self, tokens):
        self.tokens = tokens
        self.statestack = statestack = []    # Stack of parsing states
        self.symstack = symstack = []        # Stack of grammar symbols
        pslice._stack = symstack             # Associate the stack with the production
        self.restart()

        # Set up position tracking
        track_positions = self.track_positions
        if not hasattr(self, '_line_positions'):
            self._line_positions = { }       # id: -> lineno
            self._index_positions = { }      # id: -> (start, end)

        errtoken = None                      # Err token
        while True:
            # Get the next symbol on the input. If a lookahead symbol
@@ -2093,7 +2101,7 @@ def parse(self, tokens):
                    if not lookahead:
                        lookahead = YaccSymbol()
                        lookahead.type = '$end'

                # Check the action table
                ltype = lookahead.type
                t = actions[self.state].get(ltype)
@@ -2129,7 +2137,23 @@ def parse(self, tokens):
                    value = p.func(self, pslice)
                    if value is pslice:
                        value = (pname, *(s.value for s in pslice._slice))

                    sym.value = value

                    # Record positions
                    if track_positions:
                        if plen:
                            sym.lineno = symstack[-plen].lineno
                            sym.index = symstack[-plen].index
                            sym.end = symstack[-1].end
                        else:
                            # A zero-length production (what to put here?)
                            sym.lineno = None
                            sym.index = None
                            sym.end = None
                        self._line_positions[id(value)] = sym.lineno
                        self._index_positions[id(value)] = (sym.index, sym.end)

                    if plen:
                        del symstack[-plen:]
                        del statestack[-plen:]
@@ -2214,6 +2238,8 @@ def parse(self, tokens):
                    t.lineno = lookahead.lineno
                if hasattr(lookahead, 'index'):
                    t.index = lookahead.index
                if hasattr(lookahead, 'end'):
                    t.end = lookahead.end
                t.value = lookahead
                lookaheadstack.append(lookahead)
                lookahead = t
@@ -2225,3 +2251,11 @@ def parse(self, tokens):

            # Call an error function here
            raise RuntimeError('sly: internal parser error!!!\n')

    # Return position tracking information
    def line_position(self, value):
        return self._line_positions[id(value)]

    def index_position(self, value):
        return self._index_positions[id(value)]

72 changes: 15 additions & 57 deletions tests/test_lex.py
@@ -1,11 +1,6 @@
import pytest
from sly import Lexer

try:
    import regex
except ImportError:
    regex = None

class CalcLexer(Lexer):
    # Set of token names. This is always required
    tokens = {
@@ -61,29 +56,6 @@ def error(self, t):
    def __init__(self):
        self.errors = []

if regex is not None:
    class RegexModuleCalcLexer(Lexer):
        regex_module = regex

        tokens = { 'ID', 'PLUS', 'MINUS' }

        literals = { '(', ')' }
        ignore = ' \t'

        ID = r'\p{Ll}+'    # Unicode lowercase letters, regex module feature
        PLUS = r'\+'
        MINUS = r'-'

        ignore_comment = r'\#.*'

        @_(r'\n+')
        def newline(self, t):
            self.lineno += t.value.count('\n')

        def ID(self, t):
            t.value = t.value.upper()
            return t

# Test basic recognition of various tokens and literals
def test_tokens():
    lexer = CalcLexer()
@@ -93,17 +65,21 @@ def test_tokens():
    assert types == ['ID','NUMBER','PLUS','MINUS','TIMES','DIVIDE','ASSIGN','LT','LE','(',')']
    assert vals == ['ABC', 123, '+', '-', '*', '/', '=', '<', '<=', '(', ')']

# Test third-party regex module support
@pytest.mark.skipif(regex is None,
                    reason="third-party regex module not installed")
def test_3rd_party_regex_module():
    lexer = RegexModuleCalcLexer()
    toks = list(lexer.tokenize('a + b - c'))
    types = [t.type for t in toks]
    vals = [t.value for t in toks]
    assert types == ['ID','PLUS','ID','MINUS','ID']
    assert vals == ['A', '+', 'B', '-', 'C']

# Test position tracking
def test_positions():
    lexer = CalcLexer()
    text = 'abc\n( )'
    toks = list(lexer.tokenize(text))
    lines = [t.lineno for t in toks]
    indices = [t.index for t in toks]
    ends = [t.end for t in toks]
    values = [text[t.index:t.end] for t in toks]
    assert values == ['abc', '(', ')']
    assert lines == [1, 2, 2]
    assert indices == [0, 4, 6]
    assert ends == [3, 5, 7]


# Test ignored comments and newlines
def test_ignored():
    lexer = CalcLexer()
@@ -228,23 +204,5 @@ def test_modern_error_return():
    assert vals == [123, ':+-', '+', '-']
    assert lexer.errors == [ ':+-' ]

# Test Lexer Inheritance. This class should inherit all of the tokens
# and features of ModernCalcLexer, but add two new tokens to it. The
# PLUSPLUS token matches before the PLUS token.

if False:
    class SubModernCalcLexer(ModernCalcLexer):
        tokens |= { DOLLAR, PLUSPLUS }
        DOLLAR = r'\$'
        PLUSPLUS = r'\+\+'
        PLUSPLUS.before = PLUS

    def test_lexer_inherit():
        lexer = SubModernCalcLexer()
        toks = list(lexer.tokenize('123 + - $ ++ if'))
        types = [t.type for t in toks]
        vals = [t.value for t in toks]
        assert types == ['NUMBER', 'PLUS', 'MINUS', 'DOLLAR', 'PLUSPLUS', 'IF']
        assert vals == [123, '+', '-', '$', '++', 'if']

