forked from njm64/zawk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tok.awk
94 lines (78 loc) · 2.57 KB
/
tok.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
function tok_init( i, n, entry_len) {
cpu_pc = hdr_dictionary_offset
tok_read_separators()
tok_read_dictionary()
tok_word_length = 6 # This varies by version
}
function tok_read_separators( i, n, c) {
n = fetch_u8()
for(i = 0; i < n; i++) {
c = chr[fetch_u8()]
tok_separators[c] = 1
}
# Space is not technically a separator character, but we can treat
# it like one, as long as we don't include it in the list of tokens
# that are returned from read.
tok_separators[" "] = 1
}
function tok_read_dictionary( i, n, wsize) {
wsize = fetch_u8()
n = fetch_u16()
for(i = 0; i < n; i++) {
txt_decode(cpu_pc)
tok_dict[txt_buf] = cpu_pc
txt_clear()
cpu_pc += wsize
}
}
# Split the given string and store it in the globals tok_array/tok_count
# Separators are returned as tokens, and spaces are too (we need them
# to calculate offsets correctly)
function tok_split(s, i, c, prev, prev_split) {
prev_split = 1
tok_count = 0
for(i = 1; i <= length(s); i++) {
c = substr(s, i, 1)
# Split points are before/after spaces & separator characters
if(i > 0 && (c in tok_separators || prev in tok_separators)) {
tok_array[tok_count++] = substr(s, prev_split, i - prev_split)
prev_split = i
}
prev = c
}
tok_array[tok_count++] = substr(s, prev_split)
}
function tok_read_line() {
FS=RS="\n"
if(!getline < "/dev/tty") {
cpu_break = 1
}
return $0
}
function tok_read(text_buf, parse_buf, line, max_chars, max_tokens, i, tok, t, offset) {
max_chars = mem_read_u8(text_buf) - 1
max_tokens = mem_read_u8(parse_buf)
# Read a line, convert to lower case, truncate if necessary,
# then write into text_buf.
line = tok_read_line()
line = tolower(line)
line = substr(line, 1, max_chars)
mem_write_string(text_buf, line)
# Split into tokens
tok_split(line)
# For each non-space token, write the address of the token in the
# dictionary, the length of the token in bytes, and the offset into
# the text_buf.
for(i = 0; i < tok_count && n < max_tokens; i++) {
tok = tok_array[i]
if(tok != " ") {
mem_write_u16(parse_buf + 2 + t * 4, tok_dict[substr(tok, 1, 6)])
mem_write_u8(parse_buf + 4 + t * 4, length(tok))
mem_write_u8(parse_buf + 5 + t * 4, offset + 1)
t++
}
offset += length(tok)
}
# Write the total number of non-space tokens
mem_write_u8(parse_buf + 1, t)
}