1
+ /*
2
+ /*
3
+ * The MIT License (MIT)
4
+ *
5
+ * Copyright (c) 2019 Robert Einhorn
6
+ *
7
+ * Permission is hereby granted, free of charge, to any person
8
+ * obtaining a copy of this software and associated documentation
9
+ * files (the "Software"), to deal in the Software without
10
+ * restriction, including without limitation the rights to use,
11
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ * copies of the Software, and to permit persons to whom the
13
+ * Software is furnished to do so, subject to the following
14
+ * conditions:
15
+ *
16
+ * The above copyright notice and this permission notice shall be
17
+ * included in all copies or substantial portions of the Software.
18
+ *
19
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
21
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
24
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26
+ * OTHER DEALINGS IN THE SOFTWARE.
27
+ *
28
+ * Project : Python3 Indent/Dedent handler for ANTLR4 grammars
29
+ * https://github.com/antlr/grammars-v4/tree/master/python/python3-without-actions
30
+ * Developed by : Robert Einhorn, [email protected]
31
+ */
32
+
33
+ import org .antlr .v4 .runtime .*;
34
+
35
+ // *************************************************************************************************************
36
+ // **** THE FOLLOWING IMPORT SECTION ALSO CAN BE USED IN THE SECTION OF THE @lexer::header{} IN THE GRAMMAR ****
37
+ // *************************************************************************************************************
38
+ import java .util .*;
39
+
40
+ public class LexerWithIndentDedentInjector extends Python3Lexer { //*** https://www.antlr.org/api/Java/org/antlr/v4/runtime/Lexer.html
41
+ public LexerWithIndentDedentInjector (CharStream input ) {
42
+ super (input );
43
+ }
44
+
45
+ // *******************************************************************************************************
46
+ // **** THE FOLLOWING SECTION ALSO CAN BE USED IN THE SECTION OF THE @lexer::members{} IN THE GRAMMAR ****
47
+ // *******************************************************************************************************
48
+ // The stack that keeps track of the indentation lengths
49
+ private final Stack <Integer > indentLengths = new Stack <>() {{ push (0 ); }}; // initializing with default 0 indentation length
50
+ // A queue where extra tokens are pushed on
51
+ private final Deque <Token > pendingTokens = new ArrayDeque <>();
52
+ // An integer that stores the type of the last appended token to the token stream
53
+ private int lastAppendedTokenType ;
54
+
55
+ // The amount of opened braces, brackets and parenthesis
56
+ private int opened = 0 ;
57
+
58
+ // Was there space char in the indentations?
59
+ private boolean wasSpaceIndentation = false ;
60
+ // Was there TAB char in the indentations?
61
+ private boolean wasTabIndentation = false ;
62
+
63
+ // A string list that stores the lexer warnings
64
+ private List <String > warnings = new ArrayList <>();
65
+ // A string list that stores the lexer error messages
66
+ private List <String > errors = new ArrayList <>();
67
+
68
+ // Patterns for the custom error listener to recognize error messages
69
+ public static final String TEXT_LEXER = "lexer --> " ;
70
+ public static final String TEXT_INSERTED_INDENT = "inserted INDENT" ;
71
+
72
+ @ Override
73
+ public Token nextToken () {
74
+ final boolean atVeryFirstCharWhichIsSpaceOrTAB = getCharIndex () == 0 && List .of ((int ) ' ' , (int ) '\t' ).contains (_input .LA (1 ));
75
+ Token currentToken ;
76
+
77
+ while (true ) {
78
+ currentToken = super .nextToken (); // get a token from the inputstream
79
+ this .insertLeadingTokens (atVeryFirstCharWhichIsSpaceOrTAB , currentToken .getType (), currentToken .getStartIndex ());
80
+ switch (currentToken .getType ()) {
81
+ case OPEN_PAREN :
82
+ case OPEN_BRACK :
83
+ case OPEN_BRACE :
84
+ this .opened ++;
85
+ this .pendingTokens .addLast (currentToken ); // insert the current open parentheses or square bracket or curly brace token
86
+ break ;
87
+ case CLOSE_PAREN :
88
+ case CLOSE_BRACK :
89
+ case CLOSE_BRACE :
90
+ this .opened --;
91
+ this .pendingTokens .addLast (currentToken ); // insert the current close parentheses or square bracket or curly brace token
92
+ break ;
93
+ case NEWLINE :
94
+ if (this .opened > 0 ) { //*** https://docs.python.org/3/reference/lexical_analysis.html#implicit-line-joining
95
+ continue ; // We're inside an implicit line joining section, skip the NEWLINE token
96
+ } else {
97
+ switch (_input .LA (1 ) /* next symbol */ ) { //*** https://www.antlr.org/api/Java/org/antlr/v4/runtime/IntStream.html#LA(int)
98
+ case '\r' :
99
+ case '\n' :
100
+ case '\f' :
101
+ case '#' : //*** https://docs.python.org/3/reference/lexical_analysis.html#blank-lines
102
+ continue ; // We're on a blank line or before a comment, skip the NEWLINE token
103
+ default :
104
+ this .pendingTokens .addLast (currentToken ); // insert the current NEWLINE token
105
+ this .insertIndentDedentTokens (); //*** https://docs.python.org/3/reference/lexical_analysis.html#indentation
106
+ }
107
+ }
108
+ break ;
109
+ case EOF :
110
+ if ( !this .indentLengths .isEmpty () ) {
111
+ this .insertTrailingTokens (); // indentLengths stack wil be empty
112
+ this .checkSpaceAndTabIndentation ();
113
+ this .pendingTokens .addLast (currentToken ); // insert the current EOF token
114
+ }
115
+ break ;
116
+ default :
117
+ this .pendingTokens .addLast (currentToken ); // insert the current token
118
+ }
119
+ break ; // exit from the loop
120
+ }
121
+ this .lastAppendedTokenType = this .pendingTokens .peekFirst ().getType (); // save the token type before removing from the deque for the trailing tokens inserting later
122
+ return this .pendingTokens .pollFirst (); // append a token to the token stream until the first returning EOF
123
+ }
124
+
125
+ private void insertLeadingTokens (boolean atVeryFirstCharWhichIsSpaceOrTAB , int type , int startIndex ) {
126
+ if (atVeryFirstCharWhichIsSpaceOrTAB && // We're at the first line of the input starting with a space or TAB
127
+ !List .of (NEWLINE , EOF ).contains (type ) // and within that the first token that is visible (comments were skiped and OPEN_PAREN, OPEN_BRACK OPEN_BRACE cannot be the first token)
128
+ ) { // We need to insert a NEWLINE and an INDENT token before the first token to raise an 'unexpected indent' error by the parser later
129
+ this .insertToken (0 , startIndex - 1 , "<inserted leading NEWLINE>" + " " .repeat (startIndex ), NEWLINE , 1 , 0 );
130
+ this .insertToken (startIndex , startIndex - 1 , "<" + TEXT_INSERTED_INDENT + ", " + this .getIndentationDescription (startIndex ) + ">" , Python3Parser .INDENT , 1 , startIndex );
131
+ this .indentLengths .push (startIndex );
132
+ }
133
+ }
134
+
135
+ private void insertIndentDedentTokens () {
136
+ final int currentIndentLength = this .getIndentationLength (getText ());
137
+ int previousIndentLength = this .indentLengths .peek ();
138
+
139
+ if (currentIndentLength > previousIndentLength ) { // insert an INDENT token
140
+ this .insertToken ("<" + TEXT_INSERTED_INDENT + ", " + this .getIndentationDescription (currentIndentLength ) + ">" , Python3Parser .INDENT );
141
+ this .indentLengths .push (currentIndentLength );
142
+ } else if (currentIndentLength < previousIndentLength ) {
143
+ do { // More than 1 DEDENT token may be inserted
144
+ this .indentLengths .pop ();
145
+ previousIndentLength = this .indentLengths .peek ();
146
+ if (currentIndentLength <= previousIndentLength ) {
147
+ this .insertToken ("<inserted DEDENT, " + this .getIndentationDescription (previousIndentLength ) + ">" , Python3Parser .DEDENT );
148
+ } else {
149
+ this .insertToken ("<inserted (I N C O N S I S T E N T!) DEDENT, " + this .getIndentationDescription (currentIndentLength ) + ">" , Python3Parser .DEDENT );
150
+ this .errors .add (TEXT_LEXER + "line " + getLine () + ":" + getCharPositionInLine () + "\t IndentationError: unindent does not match any outer indentation level" );
151
+ }
152
+ } while (currentIndentLength < previousIndentLength );
153
+ }
154
+ }
155
+
156
+ private void insertTrailingTokens () {
157
+ if ( !List .of (NEWLINE , Python3Parser .DEDENT ).contains (this .lastAppendedTokenType ) ) { // If the last token was not NEWLINE or DEDENT then
158
+ this .insertToken ("<inserted trailing NEWLINE>" , NEWLINE ); // insert an extra trailing NEWLINE token that serves as the end of the statement
159
+ }
160
+
161
+ this .indentLengths .removeElementAt (0 ); // Remove the default 0 indentation length
162
+ while ( !this .indentLengths .isEmpty () ) { // Now insert as much trailing DEDENT tokens as needed
163
+ this .insertToken ("<inserted trailing DEDENT, " + this .getIndentationDescription (this .indentLengths .pop ()) + ">" , Python3Parser .DEDENT );
164
+ }
165
+ }
166
+
167
+ private String getIndentationDescription (int lengthOfIndent ) {
168
+ return "length=" + lengthOfIndent + ", level=" + (this .indentLengths .size ());
169
+ }
170
+
171
+ private void insertToken (String text , int type ) {
172
+ final int startIndex = _tokenStartCharIndex + getText ().length (); //*** https://www.antlr.org/api/Java/org/antlr/v4/runtime/Lexer.html#_tokenStartCharIndex
173
+ this .insertToken (startIndex , startIndex - 1 , text , type , getLine (), getCharPositionInLine ());
174
+ }
175
+
176
+ private void insertToken (int startIndex , int stopIndex , String text , int type , int line , int charPositionInLine ) {
177
+ CommonToken token = new CommonToken (_tokenFactorySourcePair , type , DEFAULT_TOKEN_CHANNEL , startIndex , stopIndex ); //*** https://www.antlr.org/api/Java/org/antlr/v4/runtime/CommonToken.html
178
+ token .setText (text );
179
+ token .setLine (line );
180
+ token .setCharPositionInLine (charPositionInLine );
181
+ this .pendingTokens .addLast (token );
182
+ }
183
+
184
+ // Calculates the indentation of the provided spaces, taking the
185
+ // following rules into account:
186
+ //
187
+ // "Tabs are replaced (from left to right) by one to eight spaces
188
+ // such that the total number of characters up to and including
189
+ // the replacement is a multiple of eight [...]"
190
+ //
191
+ // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
192
+ private int getIndentationLength (String textOfMatchedNEWLINE ) {
193
+ int count = 0 ;
194
+
195
+ for (char ch : textOfMatchedNEWLINE .toCharArray ()) {
196
+ switch (ch ) {
197
+ case ' ' : // A normal space char
198
+ this .wasSpaceIndentation = true ;
199
+ count ++;
200
+ break ;
201
+ case '\t' :
202
+ this .wasTabIndentation = true ;
203
+ count += 8 - (count % 8 );
204
+ break ;
205
+ }
206
+ }
207
+ return count ;
208
+ }
209
+
210
+ private void checkSpaceAndTabIndentation () {
211
+ if (this .wasSpaceIndentation && this .wasTabIndentation ) {
212
+ this .warnings .add ("Mixture of space and tab were used for indentation." );
213
+ }
214
+ }
215
+
216
+ public List <String > getWarnings () { // can be called from a grammar embedded action also
217
+ return this .warnings ;
218
+ }
219
+
220
+ public List <String > getErrorMessages () { // can be called from a grammar embedded action also
221
+ return this .errors ;
222
+ }
223
+ }
0 commit comments