forked from CosmosOS/Cosmos
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathParser.cs
324 lines (292 loc) · 12.9 KB
/
Parser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace XSharp.Compiler {
public class Parser {
/// <summary>Index in <see cref="mData"/> of the first not yet consumed character. It is
/// advanced by <see cref="NewToken"/> each time a token is built.</summary>
protected int mStart = 0;
/// <summary>Text being parsed, as provided to the constructor.</summary>
protected string mData;
/// <summary>true if whitespace tokens should be kept and propagated to the next parsing
/// stage; when false <see cref="NewToken"/> discards them.</summary>
protected bool mIncludeWhiteSpace;
/// <summary>true as long as every token encountered so far by this parser is a whitespace
/// token. Comments and literal assembler code are only recognized while this is true.</summary>
protected bool mAllWhitespace;
/// <summary>true if the parser supports pattern recognition.</summary>
protected bool mAllowPatterns;
/// <summary>Tokens produced by <see cref="Parse"/>; assigned once by the constructor.</summary>
protected TokenList mTokens;
/// <summary>Gets the list of tokens that was built when this instance was constructed.</summary>
public TokenList Tokens {
get { return mTokens; }
}
/// <summary>Separator used to split the comma separated tables below.</summary>
protected static readonly char[] mComma = new char[] { ',' };
/// <summary>Separator used to split the space separated tables below.</summary>
protected static readonly char[] mSpace = new char[] { ' ' };
/// <summary>Reserved keywords. The whole list is upper-cased before being split, so lookups
/// must be done with an upper-cased candidate (as <see cref="NewToken"/> does).</summary>
public static string[] mKeywords = (
"As,All"
+ ",BYTE"
+ ",CALL,CONST"
+ ",DWORD"
+ ",exit"
+ ",function"
+ ",goto"
+ ",IF,INTERRUPT,iret"
+ ",namespace"
+ ",PORT"
+ ",return,ret,REPEAT"
+ ",times"
+ ",var"
+ ",word,while"
).ToUpper().Split(mComma);
/// <summary>Every known register name. Populated by the static constructor from
/// <see cref="Registers8"/>, <see cref="Registers16"/>, <see cref="Registers32"/> and
/// <see cref="RegistersIdx"/>.</summary>
public static readonly string[] Registers;
/// <summary>Populated by the static constructor from <see cref="Registers32"/> and
/// <see cref="RegistersIdx"/>.</summary>
public static readonly string[] RegistersAddr;
/// <summary>Register names AH, AL, BH, BL, CH, CL, DH, DL.</summary>
public static readonly string[] Registers8 = "AH,AL,BH,BL,CH,CL,DH,DL".Split(mComma);
/// <summary>Register names AX, BX, CX, DX.</summary>
public static readonly string[] Registers16 = "AX,BX,CX,DX".Split(mComma);
/// <summary>Register names EAX, EBX, ECX, EDX.</summary>
public static readonly string[] Registers32 = "EAX,EBX,ECX,EDX".Split(mComma);
/// <summary>Register names ESI, EDI, ESP, EBP.</summary>
public static readonly string[] RegistersIdx = "ESI,EDI,ESP,EBP".Split(mComma);
/// <summary>Placeholder names that <see cref="NewToken"/> classifies as register tokens, but
/// only when the parser was created with pattern support enabled.</summary>
public static readonly string[] RegisterPatterns = "_REG,_REG8,_REG16,_REG32,_REGIDX,_REGADDR".Split(mComma);
/// <summary>Delimiter tokens. Splitting "," on a space yields the single entry ",".</summary>
public static readonly string[] Delimiters = ",".Split(mSpace);
// _ . $ are treated as AlphaNum characters, not operators. See IsAlphaNum and the comments in Parse.
// # is comment and literal, but could be reused when not first char
// string[] is used instead of string because operators can be multi char, != >= etc
/// <summary>Operator tokens, single or multi character, obtained by splitting on spaces.</summary>
public static readonly string[] Operators = "( ) () ! = != >= <= [ [- ] + - * : { } < > ?= ?& @ ~> <~ >> << ++ -- # +# & | ^".Split(mSpace);
/// <summary>Builds the aggregate register tables from the per-group tables. The per-group
/// tables are field initializers, so they are already populated when this body runs.</summary>
static Parser() {
    // Every register name, in the order: Registers8, Registers16, Registers32, RegistersIdx.
    Registers = Registers8
        .Concat(Registers16)
        .Concat(Registers32)
        .Concat(RegistersIdx)
        .ToArray();
    // Registers32 followed by RegistersIdx.
    RegistersAddr = Registers32
        .Concat(RegistersIdx)
        .ToArray();
}
/// <summary>Builds a single token from the not yet consumed text of the current line,
/// classifies it and appends it to <paramref name="aList"/> (whitespace tokens are only
/// appended when <see cref="mIncludeWhiteSpace"/> is set).</summary>
/// <remarks>The token text runs from <see cref="mStart"/> (inclusive) up to
/// <paramref name="rPos"/> (exclusive). While only whitespace has been seen so far, a leading
/// <c>//</c> starts a comment and a leading <c>!</c> starts literal assembler code; both run to
/// the end of the line. On return <see cref="mStart"/> has been advanced past the token (one
/// extra character is skipped after a Call token).</remarks>
/// <param name="aList">The token list where to add the newly recognized token.</param>
/// <param name="lineNumber">Line number for diagnostics and debugging purpose.</param>
/// <param name="rPos">Index in the current line just past the last character of the token.
/// It is moved to the end of the line when the comment / literal assembler branch is taken.</param>
protected void NewToken(TokenList aList, int lineNumber, ref int rPos) {
    #region Pattern Notes
    // All patterns start with _, this makes them reserved. User can use too, but at own risk of conflict.
    //
    // Wildcards
    // -_REG or ??X
    // -_REG8 or ?H,?L
    // -_REG16 or ?X
    // -_REG32 or E?X
    // - ? based ones are ugly and less clear
    // -_Keyword
    // -_ABC
    //
    //
    // Multiple Options (All caps only) - Registers only
    // Used to support EAX,EBX - ie lists. But found out it wasn't really needed. May add again later.
    //
    // -AX/AL - Conflict if we ever use /
    // -AX|AL - Conflict if we ever use |
    // -AX,AL - , is unlikely to ever be used as an operator and is logical as a separator. Method calls might use, but likely better to use a space
    // since we will only allow simple arguments, not compound.
    // -_REG:AX|AL - End terminator issue
    // -_REG[AX|AL] - Conflict with existing indirect access. Is indirect access always numeric? I think x86 has some register based ones too.
    //
    //
    // Specific: Register, Keyword, AlphaNum
    // -EAX
    #endregion
    string xString = null;
    char xChar1 = mData[mStart];
    var xToken = new Token(lineNumber);
    // Recognize comments and literal assembler code. Only possible while nothing but
    // whitespace has been seen on the line.
    if (mAllWhitespace && "/!".Contains(xChar1)) {
        rPos = mData.Length; // This will account for the dummy whitespace at the end.
        xString = mData.Substring(mStart + 1, rPos - mStart - 1).Trim();
        // So ToString/Format wont generate error
        xString = xString.Replace("{", "{{");
        xString = xString.Replace("}", "}}");
        // Issues #15662 / #15663: the second '/' is read from mData, so the guard must be a
        // bounds check on mData. Checking the trimmed text length instead rejected an empty
        // "//" comment (trimmed text "/", length 1) and reported it as an unknown token.
        if (('/' == xChar1) && (mStart + 1 < mData.Length) && ('/' == mData[mStart + 1])) {
            // xString still starts with the second '/'; drop it.
            xString = xString.Substring(1);
            xToken.Type = TokenType.Comment;
        } else if (xChar1 == '!') {
            // Literal assembler code.
            xToken.Type = TokenType.LiteralAsm;
        }
        // A lone '/' falls through with an unset type.
    } else {
        xString = mData.Substring(mStart, rPos - mStart);
        if (string.IsNullOrWhiteSpace(xString) && xString.Length > 0) {
            xToken.Type = TokenType.WhiteSpace;
        } else if (xChar1 == '\'') {
            xToken.Type = TokenType.ValueString;
            // Remove surrounding '
            xString = xString.Substring(1, xString.Length - 2);
        } else if (char.IsDigit(xChar1)) {
            xToken.Type = TokenType.ValueInt;
        } else if (xChar1 == '$') {
            xToken.Type = TokenType.ValueInt;
            // Swap the leading $ for a 0x prefix.
            xString = "0x" + xString.Substring(1);
        } else if (IsAlphaNum(xChar1)) { // This must be after check for ValueInt
            string xUpper = xString.ToUpper();
            // Special parsing when in pattern mode. We recognize some special strings
            // which would otherwise be considered as simple AlphaNum tokens.
            if (mAllowPatterns) {
                if (RegisterPatterns.Contains(xUpper)) {
                    xToken.Type = TokenType.Register;
                } else if (xUpper == "_KEYWORD") {
                    xToken.Type = TokenType.Keyword;
                    xString = null;
                } else if (xUpper == "_ABC") {
                    xToken.Type = TokenType.AlphaNum;
                    xString = null;
                } else if (xUpper == "_PCALL") {
                    xString = null;
                    xToken.Type = TokenType.Call;
                }
            }
            // Not a pattern placeholder: classify as register, keyword, call or plain AlphaNum.
            if (xToken.Type == TokenType.Unknown) {
                if (Registers.Contains(xUpper)) {
                    xToken.Type = TokenType.Register;
                } else if (mKeywords.Contains(xUpper)) {
                    xToken.Type = TokenType.Keyword;
                } else if (xString.Contains("(") && xString.Contains(")")) {
                    // The first character is already known to be AlphaNum in this branch.
                    xToken.Type = TokenType.Call;
                } else {
                    xToken.Type = TokenType.AlphaNum;
                }
            }
        } else if (Delimiters.Contains(xString)) {
            xToken.Type = TokenType.Delimiter;
        } else if (Operators.Contains(xString)) {
            xToken.Type = TokenType.Operator;
        }
    }
    xToken.Value = xString;
    xToken.SrcPosStart = mStart;
    xToken.SrcPosEnd = xToken.Type == TokenType.Call ? rPos : rPos - 1;
    // The first non-whitespace token ends the window in which comments / literal asm can start.
    if (mAllWhitespace && (xToken.Type != TokenType.WhiteSpace)) {
        mAllWhitespace = false;
    }
    // After a Call token the character at rPos is skipped as well.
    mStart = xToken.Type == TokenType.Call ? rPos + 1 : rPos;
    if (mIncludeWhiteSpace || (xToken.Type != TokenType.WhiteSpace)) {
        aList.Add(xToken);
    }
}
/// <summary>Character classes used by <see cref="Parse"/>; a change of class between two
/// consecutive characters marks a token boundary.</summary>
protected enum CharType { WhiteSpace, Identifier, Symbol, String };
/// <summary>Tells whether a character may be part of an AlphaNum token: any letter or digit,
/// plus the underscore, dot and dollar characters.</summary>
/// <param name="aChar">The character to classify.</param>
/// <returns>true if <paramref name="aChar"/> is an AlphaNum character.</returns>
protected bool IsAlphaNum(char aChar) {
    switch (aChar) {
        case '_':
        case '.':
        case '$':
            return true;
        default:
            return char.IsLetterOrDigit(aChar);
    }
}
/// <summary>Consume text that has been provided to the class constructor, splitting it into
/// a list of tokens.</summary>
/// <remarks>A token boundary is detected each time the character class (whitespace,
/// identifier, symbol) changes. A string literal enclosed in single quotes is scanned as a
/// unit, and so is everything from the pending text up to the last closing parenthesis of the
/// line once an opening parenthesis is met.</remarks>
/// <param name="lineNumber">Line number for diagnostics and debugging.</param>
/// <returns>The resulting tokens list.</returns>
/// <exception cref="Exception">A string literal is not terminated.</exception>
protected TokenList Parse(int lineNumber) {
    // Save in comment, might be useful in future. Already had to dig it out of TFS once
    //var xRegex = new System.Text.RegularExpressions.Regex(@"(\W)");
    var xResult = new TokenList();
    CharType xLastCharType = CharType.WhiteSpace;
    char xChar;
    CharType xCharType = CharType.WhiteSpace;
    int i = 0;
    for (i = 0; i < mData.Length; i++) {
        xChar = mData[i];
        // Extract string literal (surrounded with single quote characters).
        if (xChar == '\'') {
            // Take data before the ' as a token. When nothing is pending (quote at the very
            // start of the line or right after a token boundary) there is no such token:
            // building one from an empty span would throw inside NewToken.
            if (mStart < i) {
                NewToken(xResult, lineNumber, ref i);
            }
            // Now scan to the next ' taking into account escaped single quotes.
            bool escapedCharacter = false;
            for (i = i + 1; i < mData.Length; i++) {
                bool done = false;
                switch (mData[i]) {
                    case '\'':
                        if (!escapedCharacter) { done = true; }
                        break;
                    case '\\':
                        escapedCharacter = !escapedCharacter;
                        break;
                    default:
                        escapedCharacter = false;
                        break;
                }
                if (done) { break; }
            }
            if (i == mData.Length) {
                throw new Exception("Unterminated string.");
            }
            // Step past the closing quote; the class change below emits the string token.
            i++;
            xCharType = CharType.String;
        } else if (xChar == '(') {
            // The token extends to the LAST ')' of the line. Its position does not change
            // while scanning, so it is looked up once instead of on every character.
            int xLastClose = mData.LastIndexOf(')');
            if (i < xLastClose) {
                i = xLastClose + 1;
                NewToken(xResult, lineNumber, ref i);
            } else {
                // No ')' after this '(': the remainder of the line stays pending.
                i = mData.Length;
            }
        } else if (char.IsWhiteSpace(xChar)) {
            xCharType = CharType.WhiteSpace;
        } else if (IsAlphaNum(xChar)) {
            // _ and . were never likely to stand on their own. ie ESP _ 2 and ESP . 2 are never likely to be used.
            // Having them on their own required a lot of code
            // to treat them as a single unit where we did use them. So we treat them as AlphaNum.
            xCharType = CharType.Identifier;
        } else {
            xCharType = CharType.Symbol;
        }
        // i > 0 - Never do NewToken on first char. i = 0 is just a pass to get char and set lastchar.
        // But its faster as the second short circuit rather than a separate if.
        if ((xCharType != xLastCharType) && (0 < i)) {
            NewToken(xResult, lineNumber, ref i);
        }
        xLastCharType = xCharType;
    }
    // Last token
    if (mStart < mData.Length) {
        // After an unmatched '(' the loop increment leaves i one past the end of the line;
        // clamp it so the trailing text is tokenized instead of overrunning the string.
        if (mData.Length < i) {
            i = mData.Length;
        }
        NewToken(xResult, lineNumber, ref i);
    }
    return xResult;
}
/// <summary>Create a new Parser instance and immediately consume the given <paramref name="aData"/>
/// string. On return the <see cref="Tokens"/> property is available for enumeration.</summary>
/// <param name="aData">The text to be parsed. WARNING : This is expected to be a single full line
/// of text. The parser can be created with a special "pattern recognition" mode.</param>
/// <param name="lineNumber">Line number recorded on every token, for diagnostics.</param>
/// <param name="aIncludeWhiteSpace">True to keep whitespace tokens in the resulting list,
/// false to drop them.</param>
/// <param name="aAllowPatterns">True if <paramref name="aData"/> is a pattern and thus the parsing
/// should be performed specifically.</param>
/// <exception cref="ArgumentNullException"><paramref name="aData"/> is null.</exception>
/// <exception cref="Exception">At least one unrecognized token has been parsed.</exception>
public Parser(string aData, int lineNumber, bool aIncludeWhiteSpace, bool aAllowPatterns) {
    if (aData == null) {
        // Fail here with a clear message rather than with a null dereference inside Parse.
        throw new ArgumentNullException("aData");
    }
    mData = aData;
    mIncludeWhiteSpace = aIncludeWhiteSpace;
    mAllowPatterns = aAllowPatterns;
    mAllWhitespace = true;
    mTokens = Parse(lineNumber);
    // Single pass over the tokens: report the first one that could not be classified.
    foreach (Token xToken in mTokens) {
        if (TokenType.Unknown == xToken.Type) {
            throw new Exception(string.Format("Unknown token '{0}' found at {1}/{2}.",
                xToken.Value ?? "NULL", xToken.LineNumber, xToken.SrcPosStart));
        }
    }
}
}
}