-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscanner.py
242 lines (185 loc) · 6.97 KB
/
scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
''' scanner.py -- generates a state machine scanner for the Algol W READ statement
See the Algol W Language Description, section 6.3
--
This file is part of Awe. Copyright 2012 Glyn Webster.
Awe is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Awe is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public
License along with Awe. If not, see <http://www.gnu.org/licenses/>.
'''
# Banner written at the very top of the generated scanner.inc file.
MESSAGE = """/* GENERATED BY scanner.py: DON'T EDIT THIS FILE, EDIT scanner.py */
"""
# C template for one scanner state: a `case` on the state number wrapping a
# `switch (c)` on the current character.  %(STATE)s is the state number and
# %(CASES)s is the concatenation of that state's CASE lines.  Any character
# without an explicit case reaches `default`, which closes the buffer and
# returns Error.
STATE = '''
case %(STATE)s:
switch (c) {
%(CASES)s
default: Scanner_close_buffer(scanner); return Error;
}
break;
'''
# C template for one transition: %(CONDITION)s is the list of `case` labels
# for the character class, %(NEXT)s is the state to move to, and %(ACTION)s is
# the C statement(s) executed when the transition is taken.
CASE = ' %(CONDITION)s scanner->state = %(NEXT)s; %(ACTION)s break;\n'
# Shorthand for the various classes of character that the scanner should see:
#
# Each row is a one-character code followed by the C `case` labels that the
# code stands for.  The codes are what the second column of transition_data
# refers to.  `//` comments are stripped before the table is parsed, and the
# table is a raw string so that C escapes such as '\n' reach the generated
# code unchanged.
#
# NOTE(review): the `'0' ... '9'` case-range form is a GNU C extension, so the
# generated scanner.inc presumably has to be compiled with GCC or Clang.
# NOTE(review): "ISO-8559-1" in the table is presumably ISO-8859-1, and the
# `c` class starts at '\xAF' -- confirm whether '\xA0' was intended.
condition_data = r"""
n case '\n': //various white space
s case ' ':
! case EOF:
w case ' ': case '\n': case EOF:
d case '0' ... '9': //parts of numbers
- case '-': case '+':
. case '.':
e case '\'': case 'e': case 'E': //exponent sign
//hexadecimal for BITS
# case '#':
h case '0' ... '9': case 'A' ... 'F': case 'a' ... 'f':
//strings of printable ISO-8559-1 characters
" case '"':
c case ' ': case '!': case '#' ... '~': case '\xAF' ... '\xFF':
I case 'I': case 'i': //various letters
T case 'T': case 't':
R case 'R': case 'r':
U case 'U': case 'u':
E case 'E': case 'e':
F case 'F': case 'f':
A case 'A': case 'a':
L case 'L': case 'l':
S case 'S': case 's':
""" #"
# The state machine: current state, current character, next state, action.
#
# See aweio.c for the meanings of the Scanner_* actions.
# (Basically, they are filling a buffer with the C versions of
# the Algol W constants which will be rescanned with 'strtod' and friends.)
#
# Each row is: state number, one-character condition code (defined in
# condition_data), next state number, and optionally the C statement(s) to run
# on that transition.  `//` comments are stripped before the table is parsed.
#
# A row with no action gets default_action, which appends the current
# character to the scanner's buffer.  An action of just `;` is an empty C
# statement, i.e. the character is consumed without being stored.
#
# NOTE(review): the comment on the `0 n` row mentions incrementing the line
# count, but the row's action is the empty statement -- presumably the count
# is maintained elsewhere; confirm against aweio.c.
default_action = 'Scanner_addchar(scanner, c);'
transition_data = r'''
// Ready for the next constant.
0 d 101 Scanner_start(scanner); Scanner_addchar(scanner, c); //These enter the number reading states.
0 e 103 Scanner_start(scanner); Scanner_addstring(scanner, "1.0e");
0 . 107 Scanner_start(scanner); Scanner_addstring(scanner, "0.");
0 - 100 Scanner_start(scanner); Scanner_addchar(scanner, c);
0 " 301 Scanner_start(scanner); //These enter string, bits, and TRUE/FALSE constant states.
0 # 401 Scanner_start(scanner);
0 T 501 Scanner_start(scanner); Scanner_addchar(scanner, c);
0 F 601 Scanner_start(scanner); Scanner_addchar(scanner, c);
0 s 0 ; //Leading spaces are ignored
0 n 0 ; //Newlines are ignored, but increment the line count
0 ! 0 Scanner_close_buffer(scanner); return Eof;
//An end of file when expecting a constant raises an exception.
// Numbers: integer, real, or imaginary.
// Algol W lets a ridiculous number of real constant parts be optional.
// An integer or real terminated with a sign rather than a space is the
// real part of a complex number, the imaginary part will follow (state 200).
100 . 107 Scanner_addstring(scanner, "0.");
100 d 101
100 e 103 Scanner_addstring(scanner, "1.0e");
101 d 101
101 . 102
101 e 103 Scanner_addstring(scanner, ".0e");
101 I 105 ;
101 L 109 ;
101 - 200
101 w 0 Scanner_close_buffer(scanner); return Integer;
102 d 102
102 e 103 Scanner_addstring(scanner, "0e");
102 w 0 Scanner_close_buffer(scanner); return Real;
102 I 105 Scanner_addstring(scanner, "0");
102 L 109 Scanner_addstring(scanner, "0");
102 - 200
103 d 104
103 - 108
104 d 104
104 I 105 ;
104 L 109 ;
104 - 200
104 w 0 Scanner_close_buffer(scanner); return Real;
105 L 106 ;
105 w 0 Scanner_close_buffer(scanner); return Imaginary;
106 w 0 Scanner_close_buffer(scanner); return Imaginary;
107 d 102
108 d 104
109 - 200
109 w 0 Scanner_close_buffer(scanner); return Real;
// Numbers, the imaginary part of a complex number.
// This must contain an I.
200 d 201
200 e 203 Scanner_addstring(scanner, "1.0e");
200 . 207 Scanner_addstring(scanner, "0.");
201 d 201
201 . 202
201 I 205 ;
201 e 203 Scanner_addstring(scanner, ".0e");
202 d 202
202 e 203 Scanner_addstring(scanner, "0e");
202 I 205 Scanner_addstring(scanner, "0");
203 d 204
203 - 209
204 d 204
204 I 205 ;
204 w 0 Scanner_close_buffer(scanner); return Complex;
205 L 206 ;
205 w 0 Scanner_close_buffer(scanner); return Complex;
206 w 0 Scanner_close_buffer(scanner); return Complex;
207 d 202
209 d 204
// Quoted strings.
// Two double quotes are the escape for a double quote character.
301 c 301
301 " 302 ;
302 " 301 Scanner_addstring(scanner, "\"\"");
302 w 0 Scanner_close_buffer(scanner); return String;
// Bits, in hexadecimal.
401 h 401
401 w 0 Scanner_close_buffer(scanner); return Bits;
// "TRUE" and "FALSE"
501 R 502
502 U 503
503 E 504
504 w 0 Scanner_close_buffer(scanner); return Logical;
601 A 602
602 L 603
603 S 604
604 E 605
605 w 0 Scanner_close_buffer(scanner); return Logical;
'''
# Generate the C code of the state machine conditions and transitions
#
# Parses the two text tables into:
#   conditions  -- {condition code: C `case` labels}
#   transitions -- {state: [(condition code, next state, action), ...]}
# States and next states are kept as the digit strings found in the table.
import re

# `//` comments, together with the spaces in front of them, are removed from
# both tables before parsing.
COMMENT = re.compile(r'(?m) *//.*?$')

# A condition row: one-character code, then the C `case` labels.
CONDITION = re.compile(r'(?m)^ *(.) +(.+) *$')
conditions = {}
for m in CONDITION.finditer(COMMENT.sub('', condition_data)):
    conditions[m.group(1)] = m.group(2)

# A transition row: state, condition code, next state, optional action.
TRANSITION = re.compile(r'(?m)^ *(\d+) +(.) +(\d+) *(.+)? *$')
transitions = {}
for m in TRANSITION.finditer(COMMENT.sub('', transition_data)):
    state, condition_code, next_state, action = m.groups()
    # Fail here, naming the offending row, rather than with a bare KeyError
    # later when the C code is generated.
    if condition_code not in conditions:
        raise ValueError(
            'transition %s -> %s uses undefined condition code %r'
            % (state, next_state, condition_code))
    # The action group is None when the row gives no action.
    transitions.setdefault(state, []).append(
        (condition_code, next_state, action or default_action))
# Output C code for state machine to scanner.inc
#
# Every state becomes one STATE block and every transition of that state
# becomes one CASE line inside it.  States are emitted in sorted order of
# their keys, which are strings, so the order is lexicographic.
state_blocks = []
for state in sorted(transitions):
    cases = ''.join(
        CASE % {'CONDITION': conditions[code], 'NEXT': target, 'ACTION': action}
        for code, target, action in transitions[state])
    state_blocks.append(STATE % {'STATE': state, 'CASES': cases})
c_code = ''.join(state_blocks)

# The context manager closes the file even if a write fails part-way.
with open('scanner.inc', 'w') as f:
    f.write(MESSAGE)
    f.write(c_code)
# Output a "dot file" that can be used to visualize the state machine with GraphViz.
def escape(s):
    """Return *s* prepared for use inside a double-quoted GraphViz label.

    Backslash escape sequences in *s* are interpreted first (for example the
    two characters backslash + n become a newline), then every double quote
    in the result is prefixed with a backslash.
    """
    raw_bytes = s.encode('utf-8')
    unescaped = raw_bytes.decode('unicode_escape')
    return unescaped.replace('"', '\\"')
# Write one GraphViz edge per transition, labelled with its condition code.
# The context manager closes the file even if a write fails part-way.
with open("scanner.dot", 'w') as f:
    f.write('digraph state_machine { rankdir=LR; node [shape = circle];\n')
    for state in sorted(transitions):
        for code, target, _action in transitions[state]:
            # To show the action in the label as well, use instead:
            #   label = escape(code) + '\\n' + escape(_action)
            label = escape(code)
            f.write('%s -> %s [label="%s"];\n' % (state, target, label))
    f.write('}\n')
#end