-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert.py
205 lines (177 loc) · 8.43 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import sys
import json
import jsbeautifier
# Converts the txt representation of the grammar into a json representation
# for the rest of the program. Names the json representation JSON_FILE_NAME.
def main(txt_file_name, json_file_name):
    """Convert a text grammar file into its JSON representation.

    The first non-comment, non-blank line of the input is a comma-separated
    list of terminals (the placeholder ``COMMA`` stands for a literal ``,``).
    Every following line belongs to a rule of the form
    ``name := body | body ...``, possibly continued over several lines.

    The JSON written to ``json_file_name`` has two top-level keys:
    ``config`` (``TERMINALS`` and ``NONTERMINALS`` lists) and ``grammar``
    (``start``, the first rule's name, and ``rules``, a list of
    ``{'start': name, 'bodies': [[token, ...], ...]}`` entries). The
    ``/eps`` token is emitted as an empty string.

    Args:
        txt_file_name: Path of the text grammar to read.
        json_file_name: Path of the JSON file to create or overwrite.

    Raises:
        IndexError: If the input has no usable lines or defines no rules.
    """
    # The context manager closes the input handle even if parsing fails
    with open(txt_file_name, 'r') as txt_file:
        # Strip comments and empty lines
        lines = [line.strip() for line in txt_file if is_valid_line(line)]

    # The first line of the file is the set of terminals. The rest are rules.
    terminals = [token.strip() for token in lines[0].split(',')]
    terminals = [',' if t == 'COMMA' else t for t in terminals]
    # dict.fromkeys drops duplicates while keeping declaration order, so the
    # generated file is the same from run to run (a set would reorder the
    # terminals depending on the string hash seed)
    terminals = list(dict.fromkeys(terminals))
    rules = coalesce_rules(lines[1:])

    # Create and mutate the rule_map before dumping it to JSON
    rule_map = get_rule_map(rules)
    handle_plus_rules(rule_map)
    handle_star_rules(rule_map)
    handle_question_rules(rule_map)

    # Generated rules are appended after the user's rules, so the first key
    # is still the first rule written in the input file
    nonterminals = list(rule_map)

    # Create dictionary mapping to be marshalled into a JSON representation
    json_repr = {
        'config': {
            'TERMINALS': terminals,
            'NONTERMINALS': nonterminals,
        },
        'grammar': {
            'start': nonterminals[0],
            'rules': [],
        },
    }
    for rule_start, rule_bodies in rule_map.items():
        # Each body becomes a list of tokens; /eps is stored as ''
        bodies = [
            ['' if token == '/eps' else token for token in rule_body.split()]
            for rule_body in rule_bodies
        ]
        json_repr['grammar']['rules'].append(
            {'start': rule_start, 'bodies': bodies}
        )

    # Dump the JSON representation to the output file
    json_dump = json.dumps(json_repr)
    opts = jsbeautifier.default_options()
    opts.indent_size = 2
    output = jsbeautifier.beautify(json_dump, opts)
    # Closing the handle on exit guarantees the output is flushed to disk
    with open(json_file_name, 'w') as output_file:
        output_file.write(output)

    print('Conversion successful!')
    print('Remember to update the remainder of the configuration options!')
# Mutates the rule map to search for all expressions ending in the rule_token.
# Adds a new rule corresponding to the expression via the passed-in rule_builder
# and names it according to the rule_name function. Updates each rule ending in
# the rule_token to point to the newly created rule
#
# Example Parameters:
# rule_token = '/+'
# rule_name_fn = lambda s: '_%s_plus' % s
# rule_builder_fn = lambda s, n: [s, '%s %s' % (n, s)]
#
# Example Effect:
# program := expr/+
# =>
# program := _expr_plus
# _expr_plus := expr | _expr_plus expr
# If any other rules had expr/+, they would also point to _expr_plus
def handle_custom_rules(rule_map, rule_token, rule_name_fn, rule_builder_fn):
    """Replace symbols ending in ``rule_token`` with generated helper rules.

    Every rule body in ``rule_map`` is split on whitespace. A symbol that
    ends in an unescaped ``rule_token`` (one not directly preceded by ``/``)
    is replaced by the name ``rule_name_fn`` returns for the symbol with the
    token removed. If no rule of that name exists yet, it is created with
    the bodies returned by ``rule_builder_fn``.

    ``rule_map`` and its body lists are mutated in place. Every scanned body
    is re-joined with single spaces, so runs of whitespace are normalised
    even in bodies that contain no operator.

    Args:
        rule_map: Dict mapping a rule name to a list of body strings.
        rule_token: Operator suffix to look for, e.g. ``'/+'``.
        rule_name_fn: ``fn(symbol) -> str`` naming the generated rule.
        rule_builder_fn: ``fn(symbol, generated_name) -> list`` returning
            the body strings of the generated rule.
    """
    token_len = len(rule_token)

    # Taking a snapshot of the keys means only rules present at the start
    # are scanned; rules generated below are not revisited
    for rule_name in list(rule_map):
        rule_bodies = rule_map[rule_name]
        for body_index, rule_body in enumerate(rule_bodies):
            # Tokenize the rule body to check for any custom tokens
            symbols = rule_body.split()
            for symbol_index, symbol in enumerate(symbols):
                # The length check comes first: a symbol that is exactly the
                # operator has no operand and no character before the
                # operator to inspect, so it is left untouched instead of
                # raising IndexError
                uses_operator = (
                    len(symbol) > token_len
                    and symbol[-token_len:] == rule_token
                    and symbol[-(token_len + 1)] != '/'
                )
                if not uses_operator:
                    continue

                cleaned_symbol = symbol[:-token_len]
                custom_rule_name = rule_name_fn(cleaned_symbol)
                # Create the generated rule only once; later uses share it
                if custom_rule_name not in rule_map:
                    rule_map[custom_rule_name] = rule_builder_fn(
                        cleaned_symbol, custom_rule_name
                    )
                # Point this occurrence at the generated rule
                symbols[symbol_index] = custom_rule_name

            # Untokenize back into a space-separated body and store it in
            # the same slot (the list length never changes while iterating)
            rule_bodies[body_index] = ' '.join(symbols)
# Mutates the rule map to search for all regex + expressions
# Adds a new rule corresponding to the + expression, and updates each rule
# containing a + expression to point to the newly created rule
#
# Example:
# program := expr/+
# =>
# program := _expr_plus
# _expr_plus := expr | _expr_plus expr
# If any other rules had expr/+, they would also point to _expr_plus
def handle_plus_rules(rule_map):
    """Expand every ``symbol/+`` (one or more) into a generated rule.

    Each unescaped ``symbol/+`` in ``rule_map`` is replaced by
    ``_symbol_plus``, a rule with the bodies ``_symbol_plus symbol`` and
    ``symbol``. ``rule_map`` is mutated in place.
    """
    def plus_name(symbol):
        # Name of the rule generated for ``symbol/+``
        return '_%s_plus' % symbol

    def plus_bodies(symbol, generated_name):
        # Left-recursive repetition, then the single-occurrence base case
        return ['%s %s' % (generated_name, symbol), symbol]

    handle_custom_rules(rule_map, '/+', plus_name, plus_bodies)
# Mutates the rule map to search for all regex * expressions
# Adds a new rule corresponding to the * expression, and updates each rule
# containing a * expression to point to the newly created rule
#
# Example:
# program := expr/*
# =>
# program := _expr_star
# _expr_star := /eps | _expr_star expr
# If any other rules had expr/*, they would also point to _expr_star
def handle_star_rules(rule_map):
    """Expand every ``symbol/*`` (zero or more) into a generated rule.

    Each unescaped ``symbol/*`` in ``rule_map`` is replaced by
    ``_symbol_star``, a rule with the bodies ``_symbol_star symbol`` and
    ``/eps``. ``rule_map`` is mutated in place.
    """
    def star_name(symbol):
        # Name of the rule generated for ``symbol/*``
        return '_%s_star' % symbol

    def star_bodies(symbol, generated_name):
        # Left-recursive repetition, then the empty base case
        return ['%s %s' % (generated_name, symbol), '/eps']

    handle_custom_rules(rule_map, '/*', star_name, star_bodies)
# Mutates the rule map to search for all regex ? expressions
# Adds a new rule corresponding to the ? expression, and updates each rule
# containing a ? expression to point to the newly created rule
#
# Example:
# program := expr/?
# =>
# program := _expr_one_or_none
# _expr_one_or_none := expr | /eps
# If any other rules had expr/?, they would also point to _expr_one_or_none
def handle_question_rules(rule_map):
    """Expand every ``symbol/?`` (zero or one) into a generated rule.

    Each unescaped ``symbol/?`` in ``rule_map`` is replaced by
    ``_symbol_one_or_none``, a rule with the bodies ``symbol`` and ``/eps``.
    ``rule_map`` is mutated in place.
    """
    def optional_name(symbol):
        # Name of the rule generated for ``symbol/?``
        return '_%s_one_or_none' % symbol

    def optional_bodies(symbol, generated_name):
        # The generated rule is not recursive, so its own name is unused
        return [symbol, '/eps']

    handle_custom_rules(rule_map, '/?', optional_name, optional_bodies)
# Turns the list of rules (as strings) into a rule map
# Maps the start nonterminal of a rule to a list of strings of its bodies
# There are only multiple rule bodies if there are | statements in the rule
def get_rule_map(rules):
    """Build a map from each rule's name to the list of its bodies.

    Each entry of ``rules`` is a string of the form
    ``name := body | body ...``. The text before the first ``:=`` is the
    rule name; the remainder is split on ``|`` into alternatives. Names and
    bodies are stripped of surrounding whitespace. If a name occurs more
    than once, the last definition wins.

    Args:
        rules: Iterable of single-line rule strings.

    Returns:
        Dict mapping each rule name to a list of body strings.

    Raises:
        ValueError: If a rule string contains no ``:=``.
    """
    rule_map = {}
    for rule in rules:
        # Split on the first ':=' only, so a body that itself contains ':='
        # stays in the body instead of breaking the two-value unpacking
        rule_name, rule_body = rule.split(':=', 1)
        rule_map[rule_name.strip()] = [
            body.strip() for body in rule_body.split('|')
        ]
    return rule_map
# Takes in a list where some rules are split among multiple lines and
# coalesces them so that each rule belongs to one line (one entry of rules)
def coalesce_rules(rules):
    """Merge rules spread over several lines into one string per rule.

    A line containing ``:=`` starts a new rule. Any other line is a
    continuation (typically a ``| alternative``) and is appended, separated
    by one space, to the rule most recently started. Continuation lines
    that appear before the first rule have nothing to attach to and are
    dropped.

    The input list is not modified.

    Args:
        rules: List of stripped, non-empty grammar lines.

    Returns:
        A new list with exactly one string per rule, in input order.
    """
    coalesced = []
    for line in rules:
        if ':=' in line:
            # A real rule: start a new entry
            coalesced.append(line)
        elif coalesced:
            # A continuation of the rule currently being assembled
            coalesced[-1] += ' ' + line
        # Otherwise: a stray continuation before any rule, so discard it
    return coalesced
# Input lines starting with a # are comment lines and should be ignored
# Ignore all empty lines and lines containing only whitespace
def is_valid_line(line):
    """Return True if ``line`` holds grammar content.

    Blank or whitespace-only lines, and lines whose first non-whitespace
    character is ``#``, are not valid.
    """
    content = line.strip()
    if not content:
        return False
    return not content.startswith('#')
# Script entry point: expects the input .txt path and the output .json path
if __name__ == '__main__':
    if len(sys.argv) != 3:
        # Report misuse on stderr with a non-zero status so calling scripts
        # can detect it. sys.exit is used because the bare exit() helper is
        # installed by the site module and is not guaranteed to exist.
        print('Usage: python3 convert.py <.txt> <.json>', file=sys.stderr)
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])