--[=[
Lexical scanner for creating a sequence of tokens from Lua source code.
This is a heavily modified and Roblox-optimized version of
the original Penlight Lexer module:
https://github.com/stevedonovan/Penlight
Authors:
stevedonovan <https://github.com/stevedonovan> ----------- Original Penlight lexer author
ryanjmulder <https://github.com/ryanjmulder> ------------- Penlight lexer contributor
mpeterv <https://github.com/mpeterv> --------------------- Penlight lexer contributor
Tieske <https://github.com/Tieske> ----------------------- Penlight lexer contributor
boatbomber <https://github.com/boatbomber> --------------- Roblox port, added builtin token,
added patterns for incomplete syntax, bug fixes,
behavior changes, token optimization, thread optimization,
added lexer.navigator() for non-sequential reads
Sleitnick <https://github.com/Sleitnick> ----------------- Roblox optimizations
howmanysmall <https://github.com/howmanysmall> ----------- Lua + Roblox optimizations
List of possible tokens:
- iden
- keyword
- builtin
- string
- number
- comment
- operator
--]=]
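--[=[
Usage sketch (illustrative; the require path is an assumption, not part of this module):

local lexer = require(script.Parent.lexer)
for token, content in lexer.scan('local x = "hi" -- note') do
print(token, content)
end

Prints (trailing whitespace is merged into each token):
keyword local
iden x
operator =
string "hi"
comment -- note
--]=]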
local lexer = {}
local Prefix, Suffix, Cleaner = "^[%c%s]*", "[%c%s]*", "[%c%s]+" -- the "^" in Prefix anchors each match at the current scan index
local UNICODE = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]+"
local NUMBER_A = "0[xX][%da-fA-F_]+" -- Hexadecimal
local NUMBER_B = "0[bB][01_]+" -- Binary
local NUMBER_C = "%d+%.?%d*[eE][%+%-]?%d+" -- Scientific notation
local NUMBER_D = "%d+[%._]?[%d_eE]*" -- Decimal (with optional separators/exponent)
local OPERATORS = "[:;<>/~%*%(%)%-={},%.#%^%+%%]+"
local BRACKETS = "[%[%]]+" -- needs to be a separate pattern from the other operators, or it'll mess up multiline strings
local IDEN = "[%a_][%w_]*"
local STRING_EMPTY = "(['\"])%1" -- Empty string
local STRING_PLAIN = "(['\"])[^\n]-([^\\]%1)" -- TODO: handle escaped escapes before the closing quote
local STRING_INTER = "`[^\n]-`" -- Interpolated string
local STRING_INCOMP_A = "(['\"]).-\n" -- Incomplete string that runs onto the next line
local STRING_INCOMP_B = "(['\"])[^\n]*" -- Incomplete string without a next line
local STRING_MULTI = "%[(=*)%[.-%]%1%]" -- Multiline string
local STRING_MULTI_INCOMP = "%[=*%[.-.*" -- Incomplete multiline string
local COMMENT_MULTI = "%-%-%[(=*)%[.-%]%1%]" -- Complete multiline comment
local COMMENT_MULTI_INCOMP = "%-%-%[=*%[.-.*" -- Incomplete multiline comment
local COMMENT_PLAIN = "%-%-.-\n" -- Complete single-line comment
local COMMENT_INCOMP = "%-%-.*" -- Incomplete single-line comment
-- local TYPED_VAR = ":%s*([%w%?%| \t]+%s*)" --Typed variable, parameter, function
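--[=[
A few illustrative checks of the patterns above (a sketch; assumes the locals
above are in scope):

print(string.match("[==[ multi\nline ]==]", STRING_MULTI)) --> "==" (the level capture)
print(string.match('"unterminated', STRING_INCOMP_B)) --> '"' (broken code still produces a token)
print(string.match("--[[ block ]]", COMMENT_MULTI)) --> "" (empty level capture)
--]=]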
-- Fetch the keyword/builtin/library tables for the target language at runtime.
-- Note that this executes remote code from the repository below.
local lang = loadstring(game:HttpGet("https://raw.githubusercontent.com/hello-n-bye/internal-executor-project/main/language.lua"))()
local lua_keyword = lang.keyword
local lua_builtin = lang.builtin
local lua_libraries = lang.libraries
lexer.language = lang
local lua_matches = {
-- Identifiers
{ Prefix .. IDEN .. Suffix, "var" },
-- Numbers
{ Prefix .. NUMBER_A .. Suffix, "number" },
{ Prefix .. NUMBER_B .. Suffix, "number" },
{ Prefix .. NUMBER_C .. Suffix, "number" },
{ Prefix .. NUMBER_D .. Suffix, "number" },
-- Strings
{ Prefix .. STRING_EMPTY .. Suffix, "string" },
{ Prefix .. STRING_PLAIN .. Suffix, "string" },
{ Prefix .. STRING_INCOMP_A .. Suffix, "string" },
{ Prefix .. STRING_INCOMP_B .. Suffix, "string" },
{ Prefix .. STRING_MULTI .. Suffix, "string" },
{ Prefix .. STRING_MULTI_INCOMP .. Suffix, "string" },
{ Prefix .. STRING_INTER .. Suffix, "string_inter" },
-- Comments
{ Prefix .. COMMENT_MULTI .. Suffix, "comment" },
{ Prefix .. COMMENT_MULTI_INCOMP .. Suffix, "comment" },
{ Prefix .. COMMENT_PLAIN .. Suffix, "comment" },
{ Prefix .. COMMENT_INCOMP .. Suffix, "comment" },
-- Operators
{ Prefix .. OPERATORS .. Suffix, "operator" },
{ Prefix .. BRACKETS .. Suffix, "operator" },
-- Unicode
{ Prefix .. UNICODE .. Suffix, "iden" },
-- Unknown
{ "^.", "iden" },
}
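-- Note: order matters above. Multiline strings and comments are tried before
-- OPERATORS/BRACKETS, so "--" and "[[" aren't consumed as plain operators.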
-- To reduce the amount of table indexing during lexing, we separate the matches now
local PATTERNS, TOKENS = {}, {}
for i, m in lua_matches do
PATTERNS[i] = m[1]
TOKENS[i] = m[2]
end
--- Create a plain token iterator from a string.
-- @tparam string s The source code to tokenize.
-- @treturn function An iterator that returns the next (token, content) pair on each call.
function lexer.scan(s: string)
local index = 1
local size = #s
local previousContent1, previousContent2, previousContent3, previousToken = "", "", "", ""
local thread = coroutine.create(function()
while index <= size do
local matched = false
for patternIndex, pattern in ipairs(PATTERNS) do
-- Find match
local start, finish = string.find(s, pattern, index)
if start == nil then
continue
end
-- Move head
index = finish + 1
matched = true
-- Gather results
local content = string.sub(s, start, finish)
local rawToken = TOKENS[patternIndex]
local processedToken = rawToken
-- Process token
if rawToken == "var" then
-- Since trailing whitespace is merged into the token,
-- strip it to check the actual word the token contains
local cleanContent = string.gsub(content, Cleaner, "")
if lua_keyword[cleanContent] then
processedToken = "keyword"
elseif lua_builtin[cleanContent] then
processedToken = "builtin"
elseif string.find(previousContent1, "%.[%s%c]*$") and previousToken ~= "comment" then
-- The previous token ended with a ".", so special-case indexing expressions
local parent = string.gsub(previousContent2, Cleaner, "")
local lib = lua_libraries[parent]
if lib and lib[cleanContent] and not string.find(previousContent3, "%.[%s%c]*$") then
-- Indexing a known member of a builtin library, so treat it as a builtin
processedToken = "builtin"
else
-- Indexing something that isn't a builtin library member, so it can't be a keyword/builtin
processedToken = "iden"
end
-- print("indexing",parent,"with",cleanTok,"as",t2)
else
processedToken = "iden"
end
elseif rawToken == "string_inter" then
if not string.find(content, "[^\\]{") then
-- This interpolated string has no actual interpolations, so treat it as a plain string
processedToken = "string"
else
-- We'll yield tokens ourselves, so the main loop won't need to.
-- The yields will mix string pieces with the tokens found inside the interpolations.
processedToken = nil
local isString = true
local subIndex = 1
local subSize = #content
while subIndex <= subSize do
-- Find next brace
local subStart, subFinish = string.find(content, "^.-[^\\][{}]", subIndex)
if subStart == nil then
-- No more braces, all string
coroutine.yield("string", string.sub(content, subIndex))
break
end
if isString then
-- We are currently a string
subIndex = subFinish + 1
coroutine.yield("string", string.sub(content, subStart, subFinish))
-- This brace opens code
isString = false
else
-- We are currently in code
subIndex = subFinish
local subContent = string.sub(content, subStart, subFinish - 1)
for innerToken, innerContent in lexer.scan(subContent) do
coroutine.yield(innerToken, innerContent)
end
-- This brace opens string/closes code
isString = true
end
end
end
end
-- Record the last three contents (and the last token) for the indexing context check
previousContent3 = previousContent2
previousContent2 = previousContent1
previousContent1 = content
previousToken = processedToken or rawToken
if processedToken then
coroutine.yield(processedToken, content)
end
break
end
-- No matches found
if not matched then
return
end
end
-- Completed the scan
return
end)
return function()
if coroutine.status(thread) == "dead" then
return
end
local success, token, content = coroutine.resume(thread)
if success and token then
return token, content
end
return
end
end
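--[=[
Interpolated-string sketch (illustrative): the backtick string below is split
by the sub-lexer inside lexer.scan.

for token, content in lexer.scan('print(`score: {points + 1}`)') do
print(token, content)
end

The literal pieces ("`score: {" and "}`") yield as "string", while the inner
expression is re-scanned into its own "iden", "operator", and "number" tokens.
--]=]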
--- Create a token navigator that supports non-sequential reads via Next and Peek.
function lexer.navigator()
local nav = {
Source = "",
TokenCache = table.create(50),
_RealIndex = 0,
_UserIndex = 0,
_ScanThread = nil,
}
function nav:Destroy()
self.Source = nil
self._RealIndex = nil
self._UserIndex = nil
self.TokenCache = nil
self._ScanThread = nil
end
function nav:SetSource(SourceString)
self.Source = SourceString
self._RealIndex = 0
self._UserIndex = 0
table.clear(self.TokenCache)
self._ScanThread = coroutine.create(function()
for Token, Src in lexer.scan(self.Source) do
self._RealIndex += 1
self.TokenCache[self._RealIndex] = { Token, Src }
coroutine.yield(Token, Src)
end
end)
end
function nav.Next()
nav._UserIndex += 1
if nav._RealIndex >= nav._UserIndex then
-- Already scanned, return cached
return table.unpack(nav.TokenCache[nav._UserIndex])
else
if coroutine.status(nav._ScanThread) == "dead" then
-- Scan thread dead
return
else
local success, token, src = coroutine.resume(nav._ScanThread)
if success and token then
-- Scanned new data
return token, src
else
-- Lex completed
return
end
end
end
end
function nav.Peek(PeekAmount)
local GoalIndex = nav._UserIndex + PeekAmount
if nav._RealIndex >= GoalIndex then
-- Already scanned, return cached
if GoalIndex > 0 then
return table.unpack(nav.TokenCache[GoalIndex])
else
-- Invalid peek
return
end
else
if coroutine.status(nav._ScanThread) == "dead" then
-- Scan thread dead
return
else
local IterationsAway = GoalIndex - nav._RealIndex
local success, token, src = nil, nil, nil
for _ = 1, IterationsAway do
success, token, src = coroutine.resume(nav._ScanThread)
if not (success and token) then
-- Lex completed (or the thread errored); clear any stale values
token, src = nil, nil
break
end
end
return token, src
end
end
end
return nav
end
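--[=[
Navigator usage sketch (illustrative):

local nav = lexer.navigator()
nav:SetSource("local x = 1")

print(nav.Peek(2)) --> "iden", "x " (look ahead without advancing)
print(nav.Next()) --> "keyword", "local "
print(nav.Next()) --> "iden", "x "

nav:Destroy()
--]=]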
return lexer