Skip to content

Commit 10ba06b

Browse files
authored
Merge pull request antlr#1640 from RobEin/master
Python 3 grammar without embedded actions and Tiny Python grammar
2 parents f40d77f + 5d1bcb9 commit 10ba06b

31 files changed

+2231
-1329
lines changed

python/pom.xml

+2-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
<module>python3-js</module>
1919
<module>python3-py</module>
2020
<module>python3-ts</module>
21-
<module>python3_without_actions</module>
21+
<module>python3-without-actions</module>
2222
<module>python3alt</module>
23+
<module>tiny-python</module>
2324
</modules>
2425
</project>
+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Python 3 parser without grammar actions &nbsp; [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
2+
3+
A Python 3 parser based on Bart Kiers's Python 3.3 grammar, with improved indentation handling implemented outside of the grammar. It has the following advantages:
4+
- more informative token metadata
5+
- reusable code for grammars both with and without embedded actions
6+
- detection of various indentation errors
7+
8+
#### Some indentation errors with their messages:
9+
```python
10+
i = 1 # first line begins with space
11+
# line 1:1 IndentationError: unexpected indent
12+
13+
14+
if i == 1:
15+
j =
16+
# line 2:0 IndentationError: expected an indented block
17+
18+
19+
if i == 1:
20+
j = 0
21+
k = 0
22+
# line 3:8 IndentationError: unexpected indent
23+
24+
25+
if i == 1:
26+
j = 0
27+
k = 0
28+
# line 3:2 IndentationError: unindent does not match any outer indentation level
29+
```
30+
31+
## How to use
32+
```bash
33+
antlr4 Python3.g4
34+
javac *.java
35+
java Main test.py
36+
```
37+
38+
## Related links
39+
[The Python 3.3.7 Language Reference](https://docs.python.org/3.3/reference/grammar.html)
40+
41+
[Bart Kiers's Python 3.3 ANTLR4 grammar](https://github.com/bkiers/python3-parser)
42+
43+
+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <artifactId>python3-without-actions</artifactId>
    <packaging>jar</packaging>
    <name>Python 3 grammar without embedded actions</name>
    <parent>
        <groupId>org.antlr.grammars</groupId>
        <artifactId>pythonparent</artifactId>
        <version>1.0-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
    <build>
        <plugins>
            <!-- Generates the lexer/parser (plus visitor and listener) from Python3.g4 located in the module root. -->
            <plugin>
                <groupId>org.antlr</groupId>
                <artifactId>antlr4-maven-plugin</artifactId>
                <version>${antlr.version}</version>
                <configuration>
                    <sourceDirectory>${basedir}</sourceDirectory>
                    <grammars>Python3.g4</grammars>
                    <visitor>true</visitor>
                    <listener>true</listener>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>antlr4</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!--
                Single declaration of the compiler plugin: a plugin (groupId:artifactId) must be
                declared only once per <plugins> section, so the language-level configuration and
                the execution override live together here.
            -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>11</source>
                    <target>11</target>
                </configuration>
                <executions>
                    <!-- Unbinds the default compile execution from the build lifecycle. -->
                    <execution>
                        <id>default-compile</id>
                        <phase>none</phase>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import org.antlr.v4.runtime.*;
2+
3+
public class IndentationErrorListener extends BaseErrorListener {
4+
private boolean isFirstTime = true;
5+
6+
@Override
7+
public void syntaxError(Recognizer<?, ?> recognizer,
8+
Object offendingSymbol,
9+
int line, int charPositionInLine,
10+
String msg,
11+
RecognitionException e) {
12+
13+
if (isFirstTime) {
14+
isFirstTime = false;
15+
System.out.println();
16+
System.err.println("ERROR:");
17+
}
18+
19+
if (msg.startsWith(LexerWithIndentDedentInjector.TEXT_LEXER)) { // this is a custom error message from the lexer contained a pattern
20+
System.err.println(msg.substring(LexerWithIndentDedentInjector.TEXT_LEXER.length())); // displaying the lexer error message without the pattern
21+
} else { // this is a parser error message
22+
String startOfMessage = "line " + line + ":" + charPositionInLine + "\t ";
23+
if (msg.startsWith("missing INDENT")) {
24+
System.err.println(startOfMessage + "IndentationError: expected an indented block"); // displaying the modified parser error message
25+
} else if (msg.startsWith("extraneous input '<" + LexerWithIndentDedentInjector.TEXT_INSERTED_INDENT)) {
26+
System.err.println(startOfMessage + "IndentationError: unexpected indent"); // displaying the modified parser error message
27+
} else {
28+
System.err.println(startOfMessage + "at " + offendingSymbol + ": " + msg); // displaying the original parser error message
29+
}
30+
}
31+
}
32+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
/*
2+
/*
3+
* The MIT License (MIT)
4+
*
5+
* Copyright (c) 2019 Robert Einhorn
6+
*
7+
* Permission is hereby granted, free of charge, to any person
8+
* obtaining a copy of this software and associated documentation
9+
* files (the "Software"), to deal in the Software without
10+
* restriction, including without limitation the rights to use,
11+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
12+
* copies of the Software, and to permit persons to whom the
13+
* Software is furnished to do so, subject to the following
14+
* conditions:
15+
*
16+
* The above copyright notice and this permission notice shall be
17+
* included in all copies or substantial portions of the Software.
18+
*
19+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
21+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
24+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26+
* OTHER DEALINGS IN THE SOFTWARE.
27+
*
28+
* Project : Python3 Indent/Dedent handler for ANTLR4 grammars
29+
* https://github.com/antlr/grammars-v4/tree/master/python/python3-without-actions
30+
* Developed by : Robert Einhorn, [email protected]
31+
*/
32+
33+
import org.antlr.v4.runtime.*;
34+
35+
// *************************************************************************************************************
36+
// **** THE FOLLOWING IMPORT SECTION ALSO CAN BE USED IN THE SECTION OF THE @lexer::header{} IN THE GRAMMAR ****
37+
// *************************************************************************************************************
38+
import java.util.*;
39+
40+
public class LexerWithIndentDedentInjector extends Python3Lexer { //*** https://www.antlr.org/api/Java/org/antlr/v4/runtime/Lexer.html
41+
public LexerWithIndentDedentInjector(CharStream input) {
42+
super(input);
43+
}
44+
45+
// *******************************************************************************************************
46+
// **** THE FOLLOWING SECTION ALSO CAN BE USED IN THE SECTION OF THE @lexer::members{} IN THE GRAMMAR ****
47+
// *******************************************************************************************************
48+
// The stack that keeps track of the indentation lengths
49+
private final Stack<Integer> indentLengths = new Stack<>() {{ push(0); }}; // initializing with default 0 indentation length
50+
// A queue where extra tokens are pushed on
51+
private final Deque<Token> pendingTokens = new ArrayDeque<>();
52+
// An integer that stores the type of the last appended token to the token stream
53+
private int lastAppendedTokenType;
54+
55+
// The amount of opened braces, brackets and parenthesis
56+
private int opened = 0;
57+
58+
// Was there space char in the indentations?
59+
private boolean wasSpaceIndentation = false;
60+
// Was there TAB char in the indentations?
61+
private boolean wasTabIndentation = false;
62+
63+
// A string list that stores the lexer warnings
64+
private List<String> warnings = new ArrayList<>();
65+
// A string list that stores the lexer error messages
66+
private List<String> errors = new ArrayList<>();
67+
68+
// Patterns for the custom error listener to recognize error messages
69+
public static final String TEXT_LEXER = "lexer --> ";
70+
public static final String TEXT_INSERTED_INDENT = "inserted INDENT";
71+
72+
@Override
73+
public Token nextToken() {
74+
final boolean atVeryFirstCharWhichIsSpaceOrTAB = getCharIndex() == 0 && List.of((int) ' ', (int) '\t').contains(_input.LA(1));
75+
Token currentToken;
76+
77+
while (true) {
78+
currentToken = super.nextToken(); // get a token from the inputstream
79+
this.insertLeadingTokens(atVeryFirstCharWhichIsSpaceOrTAB, currentToken.getType(), currentToken.getStartIndex());
80+
switch (currentToken.getType()) {
81+
case OPEN_PAREN:
82+
case OPEN_BRACK:
83+
case OPEN_BRACE:
84+
this.opened++;
85+
this.pendingTokens.addLast(currentToken); // insert the current open parentheses or square bracket or curly brace token
86+
break;
87+
case CLOSE_PAREN:
88+
case CLOSE_BRACK:
89+
case CLOSE_BRACE:
90+
this.opened--;
91+
this.pendingTokens.addLast(currentToken); // insert the current close parentheses or square bracket or curly brace token
92+
break;
93+
case NEWLINE:
94+
if (this.opened > 0) { //*** https://docs.python.org/3/reference/lexical_analysis.html#implicit-line-joining
95+
continue; // We're inside an implicit line joining section, skip the NEWLINE token
96+
} else {
97+
switch (_input.LA(1) /* next symbol */) { //*** https://www.antlr.org/api/Java/org/antlr/v4/runtime/IntStream.html#LA(int)
98+
case '\r':
99+
case '\n':
100+
case '\f':
101+
case '#': //*** https://docs.python.org/3/reference/lexical_analysis.html#blank-lines
102+
continue; // We're on a blank line or before a comment, skip the NEWLINE token
103+
default:
104+
this.pendingTokens.addLast(currentToken); // insert the current NEWLINE token
105+
this.insertIndentDedentTokens(); //*** https://docs.python.org/3/reference/lexical_analysis.html#indentation
106+
}
107+
}
108+
break;
109+
case EOF:
110+
if ( !this.indentLengths.isEmpty() ) {
111+
this.insertTrailingTokens(); // indentLengths stack wil be empty
112+
this.checkSpaceAndTabIndentation();
113+
this.pendingTokens.addLast(currentToken); // insert the current EOF token
114+
}
115+
break;
116+
default:
117+
this.pendingTokens.addLast(currentToken); // insert the current token
118+
}
119+
break; // exit from the loop
120+
}
121+
this.lastAppendedTokenType = this.pendingTokens.peekFirst().getType(); // save the token type before removing from the deque for the trailing tokens inserting later
122+
return this.pendingTokens.pollFirst(); // append a token to the token stream until the first returning EOF
123+
}
124+
125+
private void insertLeadingTokens(boolean atVeryFirstCharWhichIsSpaceOrTAB, int type, int startIndex) {
126+
if (atVeryFirstCharWhichIsSpaceOrTAB && // We're at the first line of the input starting with a space or TAB
127+
!List.of(NEWLINE, EOF).contains(type) // and within that the first token that is visible (comments were skiped and OPEN_PAREN, OPEN_BRACK OPEN_BRACE cannot be the first token)
128+
) { // We need to insert a NEWLINE and an INDENT token before the first token to raise an 'unexpected indent' error by the parser later
129+
this.insertToken(0, startIndex - 1, "<inserted leading NEWLINE>" + " ".repeat(startIndex), NEWLINE, 1, 0);
130+
this.insertToken(startIndex, startIndex - 1, "<" + TEXT_INSERTED_INDENT + ", " + this.getIndentationDescription(startIndex) + ">", Python3Parser.INDENT, 1, startIndex);
131+
this.indentLengths.push(startIndex);
132+
}
133+
}
134+
135+
private void insertIndentDedentTokens() {
136+
final int currentIndentLength = this.getIndentationLength(getText());
137+
int previousIndentLength = this.indentLengths.peek();
138+
139+
if (currentIndentLength > previousIndentLength) { // insert an INDENT token
140+
this.insertToken("<" + TEXT_INSERTED_INDENT + ", " + this.getIndentationDescription(currentIndentLength) + ">", Python3Parser.INDENT);
141+
this.indentLengths.push(currentIndentLength);
142+
} else if (currentIndentLength < previousIndentLength) {
143+
do { // More than 1 DEDENT token may be inserted
144+
this.indentLengths.pop();
145+
previousIndentLength = this.indentLengths.peek();
146+
if (currentIndentLength <= previousIndentLength) {
147+
this.insertToken("<inserted DEDENT, " + this.getIndentationDescription(previousIndentLength) + ">", Python3Parser.DEDENT);
148+
} else {
149+
this.insertToken("<inserted (I N C O N S I S T E N T!) DEDENT, " + this.getIndentationDescription(currentIndentLength) + ">", Python3Parser.DEDENT);
150+
this.errors.add(TEXT_LEXER + "line " + getLine() + ":" + getCharPositionInLine() + "\t IndentationError: unindent does not match any outer indentation level");
151+
}
152+
} while (currentIndentLength < previousIndentLength);
153+
}
154+
}
155+
156+
private void insertTrailingTokens() {
157+
if ( !List.of(NEWLINE, Python3Parser.DEDENT).contains(this.lastAppendedTokenType) ) { // If the last token was not NEWLINE or DEDENT then
158+
this.insertToken("<inserted trailing NEWLINE>", NEWLINE); // insert an extra trailing NEWLINE token that serves as the end of the statement
159+
}
160+
161+
this.indentLengths.removeElementAt(0); // Remove the default 0 indentation length
162+
while ( !this.indentLengths.isEmpty() ) { // Now insert as much trailing DEDENT tokens as needed
163+
this.insertToken("<inserted trailing DEDENT, " + this.getIndentationDescription(this.indentLengths.pop()) + ">", Python3Parser.DEDENT);
164+
}
165+
}
166+
167+
private String getIndentationDescription(int lengthOfIndent) {
168+
return "length=" + lengthOfIndent + ", level=" + (this.indentLengths.size());
169+
}
170+
171+
private void insertToken(String text, int type) {
172+
final int startIndex = _tokenStartCharIndex + getText().length(); //*** https://www.antlr.org/api/Java/org/antlr/v4/runtime/Lexer.html#_tokenStartCharIndex
173+
this.insertToken(startIndex, startIndex - 1, text, type, getLine(), getCharPositionInLine());
174+
}
175+
176+
private void insertToken(int startIndex, int stopIndex, String text, int type, int line, int charPositionInLine) {
177+
CommonToken token = new CommonToken(_tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, startIndex, stopIndex); //*** https://www.antlr.org/api/Java/org/antlr/v4/runtime/CommonToken.html
178+
token.setText(text);
179+
token.setLine(line);
180+
token.setCharPositionInLine(charPositionInLine);
181+
this.pendingTokens.addLast(token);
182+
}
183+
184+
// Calculates the indentation of the provided spaces, taking the
185+
// following rules into account:
186+
//
187+
// "Tabs are replaced (from left to right) by one to eight spaces
188+
// such that the total number of characters up to and including
189+
// the replacement is a multiple of eight [...]"
190+
//
191+
// -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
192+
private int getIndentationLength(String textOfMatchedNEWLINE) {
193+
int count = 0;
194+
195+
for (char ch : textOfMatchedNEWLINE.toCharArray()) {
196+
switch (ch) {
197+
case ' ': // A normal space char
198+
this.wasSpaceIndentation = true;
199+
count++;
200+
break;
201+
case '\t':
202+
this.wasTabIndentation = true;
203+
count += 8 - (count % 8);
204+
break;
205+
}
206+
}
207+
return count;
208+
}
209+
210+
private void checkSpaceAndTabIndentation() {
211+
if (this.wasSpaceIndentation && this.wasTabIndentation) {
212+
this.warnings.add("Mixture of space and tab were used for indentation.");
213+
}
214+
}
215+
216+
public List<String> getWarnings() { // can be called from a grammar embedded action also
217+
return this.warnings;
218+
}
219+
220+
public List<String> getErrorMessages() { // can be called from a grammar embedded action also
221+
return this.errors;
222+
}
223+
}

0 commit comments

Comments
 (0)