-
Notifications
You must be signed in to change notification settings - Fork 0
/
lex.py
190 lines (171 loc) · 6.71 KB
/
lex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import enum
import sys
class Lexer:
    """Hand-written lexer that hands out one Token per getToken() call.

    State kept on the instance:
      source  -- the text being lexed, with one newline appended.
      curPos  -- index of the current character in source.
      curChar -- the character at curPos, or the NUL character once curPos
                 has moved past the end of source.
    """

    def __init__(self, input):
        """Store the source text and position the cursor on its first character.

        input: the program text as a string. (The name shadows the builtin,
        but it is kept so keyword callers keep working.)
        """
        self.source = input + '\n'  # Appended newline simplifies lexing/parsing the last token/statement.
        self.curChar = ''           # Current character in the string.
        self.curPos = -1            # Current position in the string.
        self.nextChar()

    def nextChar(self):
        """Advance the cursor by one and refresh curChar ('\\0' marks EOF)."""
        self.curPos += 1
        if self.curPos >= len(self.source):
            self.curChar = '\0'  # EOF
        else:
            self.curChar = self.source[self.curPos]

    def peek(self):
        """Return the character after the current one without moving ('\\0' at EOF)."""
        if self.curPos + 1 >= len(self.source):
            return '\0'
        return self.source[self.curPos + 1]

    def abort(self, message):
        """Report an invalid token: exits via sys.exit with a 'Lexing error.' message."""
        sys.exit("Lexing error. " + message)

    def skipWhitespace(self):
        """Skip spaces, tabs and carriage returns.

        Newlines are deliberately not skipped: they are tokens that mark the
        end of a statement.
        """
        while self.curChar == ' ' or self.curChar == '\t' or self.curChar == '\r':
            self.nextChar()

    def skipComment(self):
        """If the cursor is on '#', advance to (but not past) the next newline."""
        if self.curChar == '#':
            # __init__ always appends '\n', so this loop terminates.
            while self.curChar != '\n':
                self.nextChar()

    @staticmethod
    def _isAsciiDigit(char):
        """Return True only for the ASCII digits '0'..'9'.

        str.isdigit() is Unicode-aware and also accepts characters such as
        superscript two or Arabic-Indic digits, which must not be lexed as
        part of a NUMBER token.
        """
        return '0' <= char <= '9'

    def _withOptionalEquals(self, plainKind, equalsKind):
        """Lex an operator that may be followed by '=' (e.g. '>' vs '>=').

        Leaves the cursor on the last character of the operator.
        """
        first = self.curChar
        if self.peek() == '=':
            self.nextChar()
            return Token(first + self.curChar, equalsKind)
        return Token(first, plainKind)

    def _stringToken(self):
        """Lex a double-quoted string; the cursor starts on the opening quote.

        Leaves the cursor on the closing quote. The token text excludes both
        quotes.
        """
        self.nextChar()
        startPos = self.curPos
        while self.curChar != '\"':
            # Don't allow special characters in the string. No escape characters, newlines, tabs, or %.
            # We will be using C's printf on this string.
            if self.curChar in ('\r', '\n', '\t', '\\', '%'):
                self.abort("Illegal character in string.")
            self.nextChar()
        return Token(self.source[startPos:self.curPos], TokenType.STRING)

    def _numberToken(self):
        """Lex digits with an optional '.digits' fraction; cursor starts on the first digit.

        Leaves the cursor on the last character of the number.
        """
        startPos = self.curPos
        while self._isAsciiDigit(self.peek()):
            self.nextChar()
        if self.peek() == '.':  # Decimal!
            self.nextChar()
            # Must have at least one digit after decimal.
            if not self._isAsciiDigit(self.peek()):
                self.abort("Illegal character in number.")
            while self._isAsciiDigit(self.peek()):
                self.nextChar()
        return Token(self.source[startPos:self.curPos + 1], TokenType.NUMBER)

    def _wordToken(self):
        """Lex a run of alphanumeric characters as a keyword or an identifier.

        Leaves the cursor on the last character of the word. Uses
        str.isalnum(), so the accepted character set is Unicode-aware.
        """
        startPos = self.curPos
        while self.peek().isalnum():
            self.nextChar()
        tokText = self.source[startPos:self.curPos + 1]
        keyword = Token.checkIfKeyword(tokText)
        if keyword is None:  # Identifier
            return Token(tokText, TokenType.IDENT)
        return Token(tokText, keyword)  # Keyword

    def getToken(self):
        """Return the next Token and move the cursor just past it.

        Leading whitespace and one comment are skipped first. The first
        character decides the token class; multi-character tokens (two-char
        operators, strings, numbers, words) are finished by the helpers.
        Invalid input is reported through abort().
        """
        self.skipWhitespace()
        self.skipComment()
        token = None

        if self.curChar == '+':
            token = Token(self.curChar, TokenType.PLUS)
        elif self.curChar == '-':
            token = Token(self.curChar, TokenType.MINUS)
        elif self.curChar == '*':
            token = Token(self.curChar, TokenType.ASTERISK)
        elif self.curChar == '/':
            token = Token(self.curChar, TokenType.SLASH)
        elif self.curChar == '=':
            token = self._withOptionalEquals(TokenType.EQ, TokenType.EQEQ)
        elif self.curChar == '>':
            token = self._withOptionalEquals(TokenType.GT, TokenType.GTEQ)
        elif self.curChar == '<':
            token = self._withOptionalEquals(TokenType.LT, TokenType.LTEQ)
        elif self.curChar == '!':
            # A lone '!' is not an operator; only '!=' is valid.
            if self.peek() == '=':
                lastChar = self.curChar
                self.nextChar()
                token = Token(lastChar + self.curChar, TokenType.NOTEQ)
            else:
                self.abort("Expected !=, got !" + self.peek())
        elif self.curChar == '\"':
            token = self._stringToken()
        elif self._isAsciiDigit(self.curChar):
            token = self._numberToken()
        elif self.curChar.isalpha():
            token = self._wordToken()
        elif self.curChar == '\n':
            token = Token(self.curChar, TokenType.NEWLINE)
        elif self.curChar == '\0':
            token = Token('', TokenType.EOF)
        else:
            # Unknown token!
            self.abort("Unknown token: " + self.curChar)

        # Every branch leaves the cursor on the token's last character.
        self.nextChar()
        return token
# Token contains the original text and the type of token.
class Token:
    """A single lexed token: its source text plus its classification."""

    def __init__(self, tokenText, tokenKind):
        self.text = tokenText  # The token's actual text. Used for identifiers, strings, and numbers.
        self.kind = tokenKind  # The TokenType that this token is classified as.

    def __repr__(self):
        """Return a debugging representation showing the text and kind."""
        return "Token({!r}, {!r})".format(self.text, self.kind)

    @staticmethod
    def checkIfKeyword(tokenText):
        """Return the keyword TokenType named exactly tokenText, else None.

        Relies on all keyword enum values being 1XX (100 <= value < 200).
        """
        kind = TokenType.__members__.get(tokenText)
        # __members__ also maps alias names to their canonical member; the
        # name comparison keeps aliases from matching, as in a plain
        # iteration over the enum.
        if kind is not None and kind.name == tokenText and 100 <= kind.value < 200:
            return kind
        return None
# TokenType is our enum for all the types of tokens.
@enum.unique
class TokenType(enum.Enum):
    """All token classifications, grouped by numeric range.

    Values 100-199 are keywords; Token.checkIfKeyword depends on that range.
    enum.unique makes an accidentally duplicated value fail at import time
    instead of silently turning a member into an alias.
    """

    EOF = -1
    NEWLINE = 0
    NUMBER = 1
    IDENT = 2
    STRING = 3
    # Keywords.
    LABEL = 101
    GOTO = 102
    PRINT = 103
    INPUT = 104
    LET = 105
    IF = 106
    THEN = 107
    ENDIF = 108
    WHILE = 109
    REPEAT = 110
    ENDWHILE = 111
    # Operators.
    EQ = 201
    PLUS = 202
    MINUS = 203
    ASTERISK = 204
    SLASH = 205
    EQEQ = 206
    NOTEQ = 207
    LT = 208
    LTEQ = 209
    GT = 210
    GTEQ = 211