-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscanner.go
269 lines (255 loc) · 5.91 KB
/
scanner.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"unicode"
)
// eof is the sentinel rune returned by peek when the underlying reader reports
// end of input. -1 is not a valid Unicode code point, so it cannot collide
// with a rune read from the source.
const eof = rune(-1)
// Scanner is a lexical scanner for scanning configuration files.
// This works only on UTF-8 text.
type Scanner struct {
	// r is the buffered reader wrapping the source handed to NewScanner; all
	// runes are read from (and unread back into) it.
	r *bufio.Reader
	// txt is allocated by NewScanner. NOTE(review): none of the methods
	// visible in this file read or write it - confirm whether it is still needed.
	txt *bytes.Buffer
	// currPos is the running byte offset into the input; each scan method
	// uses it for Token.Begin and advances it to produce Token.End.
	currPos int
	// line is incremented by scanNewline for every newline rune consumed.
	line int
	// err is not referenced by the methods visible in this file.
	// NOTE(review): presumably intended to hold a sticky scan error - confirm.
	err error
	// column is incremented once per token by scanRune and scanComment and
	// reset to 0 by scanNewline.
	column int
}
// NewScanner builds a Scanner that reads from src.
//
// The source is wrapped in a bufio.Reader so single runes can be read and
// pushed back; every position counter starts at its zero value.
func NewScanner(src io.Reader) *Scanner {
	sc := new(Scanner)
	sc.r = bufio.NewReader(src)
	sc.txt = new(bytes.Buffer)
	return sc
}
// Scan returns the next token of the UTF-8 encoded input, advancing past the
// text that the token covers.
//
// The next rune is inspected without being consumed and selects the kind of
// token to scan:
//
//   - an identifier rune (see isIdent) yields a single-rune Ident token;
//   - ';' starts a comment;
//   - ' ' and '\t' start a run of white space;
//   - '\n' and '\r' each yield a new line token;
//   - '=', '[', ']', '(', ')' and '!' yield their single-rune tokens.
//
// At end of input Scan returns (nil, io.EOF). Any other read error from the
// underlying reader is returned as is. A rune that starts no known token
// yields an "unrecognized token" error.
//
// NOTE(review): on the "unrecognized token" path the offending rune is left
// unread, so a following call to Scan reports the same error again.
func (s *Scanner) Scan() (*Token, error) {
	// The lookahead is done inline (read, then unread) instead of through
	// peek, because peek panics on read errors and Scan is able to return
	// them to the caller.
	ch, _, err := s.r.ReadRune()
	if err != nil {
		// io.EOF is passed through unchanged so callers can detect the end
		// of the input.
		return nil, err
	}
	// UnreadRune cannot fail here: the preceding operation on the reader was
	// a successful ReadRune.
	_ = s.r.UnreadRune()

	if isIdent(ch) {
		return s.scanIdent()
	}
	switch ch {
	case ';':
		return s.scanComment()
	case ' ', '\t':
		return s.scanWhitespace()
	case '\n', '\r':
		return s.scanNewline()
	case '=':
		return s.scanRune(Assign)
	case '[':
		return s.scanRune(LBrace)
	case ']':
		return s.scanRune(RBrace)
	case '(':
		return s.scanRune(LBracket)
	case ')':
		return s.scanRune(RBracket)
	case '!':
		return s.scanRune(Exclam)
	}
	return nil, errors.New("unrecognized token " + string(ch))
}
// scanComment scans a comment and returns it as a single Comment token.
//
// Two forms are recognised:
//
//   - a line comment, which runs up to (but not including) the next '\n' or
//     '\r', or to the end of input;
//   - a block comment, which opens with ";-- " and runs through the first
//     "--;" that follows, or to the end of input when it is never closed.
//     Line breaks inside a block comment are part of the comment text.
//
// The leading comment marker is not validated here; it is up to the caller to
// decide where a comment starts. Read errors other than io.EOF are returned
// wrapped with context.
//
// NOTE(review): s.line is not advanced for line breaks inside a block
// comment, so tokens following a multi-line block comment still carry the
// line number on which the comment began.
//
// TODO(gernest) accept the comment identifier, or check whether the first
// rune is the supported token identifier.
func (s *Scanner) scanComment() (*Token, error) {
	const (
		blockOpen  = ";-- "
		blockClose = "--;"
	)

	// Look at the opener without consuming it. Reading a fixed number of
	// runes up front would swallow the line break (and whatever follows it)
	// of a comment that is shorter than the opener.
	prefix, err := s.r.Peek(len(blockOpen))
	if err != nil && err != io.EOF {
		return nil, fmt.Errorf("scanning comment: %w", err)
	}
	// On io.EOF prefix holds the bytes that were available, which simply
	// cannot match the opener when the input is too short.
	isBlock := string(prefix) == blockOpen

	buf := &bytes.Buffer{}
	for {
		ch, _, err := s.r.ReadRune()
		if err == io.EOF {
			// End of input terminates either form of comment.
			break
		}
		if err != nil {
			return nil, fmt.Errorf("scanning comment: %w", err)
		}
		if !isBlock && (ch == '\n' || ch == '\r') {
			// The line break belongs to the next token. UnreadRune cannot
			// fail right after a successful ReadRune.
			_ = s.r.UnreadRune()
			break
		}
		_, _ = buf.WriteRune(ch) // bytes.Buffer writes never return an error
		// Checking the accumulated text finds the terminator regardless of
		// how many dashes precede it. The opener ends in a space, so it can
		// never be mistaken for (part of) the terminator.
		if isBlock && bytes.HasSuffix(buf.Bytes(), []byte(blockClose)) {
			break
		}
	}

	s.column++
	tok := &Token{}
	tok.Begin = s.currPos
	s.currPos += buf.Len() // advance the current position
	tok.End = s.currPos
	tok.Column = s.column
	tok.Type = Comment
	tok.Text = buf.String()
	tok.Line = s.line
	return tok, nil
}
// scanWhitespace scans a run of white space and returns it as a single
// WhiteSpace token.
//
// Only space (' ') and tab ('\t') count as white space. Scanning stops at the
// first other rune, which is left in the reader for the next call to Scan, or
// at the end of input. Read errors other than io.EOF are returned.
//
// The scanner's column counter is not advanced by white space; the token
// carries the column value that was current when scanning began.
func (s *Scanner) scanWhitespace() (*Token, error) {
	// There can be arbitrarily many spaces, so buffer them up.
	buf := &bytes.Buffer{}
	for {
		ch, _, err := s.r.ReadRune()
		if err == io.EOF {
			// End of input simply ends the run.
			break
		}
		if err != nil {
			return nil, err
		}
		if ch != ' ' && ch != '\t' {
			// Push the rune back so the next call to Scan sees it.
			// UnreadRune cannot fail right after a successful ReadRune.
			_ = s.r.UnreadRune()
			break
		}
		_, _ = buf.WriteRune(ch) // bytes.Buffer writes never return an error
	}

	tok := &Token{}
	tok.Column = s.column
	tok.Begin = s.currPos
	s.currPos += buf.Len()
	tok.End = s.currPos
	tok.Type = WhiteSpace
	tok.Text = buf.String()
	tok.Line = s.line
	return tok, nil
}
// scanNewline consumes one rune and returns it as an NLine token.
//
// New lines get their own token type, separate from ordinary white space,
// because many configuration file formats give line breaks a meaning.
//
// The rune is not checked here: the caller is expected to have seen a
// carriage return ('\r') or a line feed ('\n') as the next rune. Every call
// bumps the line counter and resets the column counter to 0, and the token
// reports those updated values.
//
// TODO(gernest) accept a new line character as input.
func (s *Scanner) scanNewline() (*Token, error) {
	r, width, err := s.r.ReadRune()
	if err != nil {
		return nil, err
	}

	start := s.currPos
	s.currPos = start + width
	s.line++
	s.column = 0

	return &Token{
		Type:   NLine,
		Text:   string(r),
		Begin:  start,
		End:    s.currPos,
		Column: s.column,
		Line:   s.line,
	}, nil
}
// isIdent reports whether ch may appear in an identifier.
//
// Accepted runes are:
//
//	underscore '_'
//	dash       '-'
//	plus       '+'
//	any rune for which unicode.IsLetter is true
//	any rune for which unicode.IsDigit is true
func isIdent(ch rune) bool {
	switch ch {
	case '_', '-', '+':
		return true
	}
	if unicode.IsLetter(ch) {
		return true
	}
	return unicode.IsDigit(ch)
}
// scanIdent consumes the next rune of the input and returns it as an Ident
// token. It scans exactly one rune per call, delegating to scanRune.
//
// TODO(gernest) Accept the character as input argument.
func (s *Scanner) scanIdent() (*Token, error) {
	tok, err := s.scanRune(Ident)
	return tok, err
}
// scanRune consumes a single rune from the input and wraps it in a token of
// type typ whose Text is that rune.
//
// It is meant for tokens that are exactly one character long. The byte
// position advances by the encoded width of the rune and the column counter
// by one. Any error from the underlying reader, io.EOF included, is returned
// unchanged together with a nil token.
func (s *Scanner) scanRune(typ TokenType) (*Token, error) {
	r, width, err := s.r.ReadRune()
	if err != nil {
		return nil, err
	}

	start := s.currPos
	s.currPos = start + width
	s.column++

	return &Token{
		Type:   typ,
		Text:   string(r),
		Begin:  start,
		End:    s.currPos,
		Column: s.column,
		Line:   s.line,
	}, nil
}
// peek returns the next rune of the input without consuming it: the rune is
// read and immediately pushed back into the reader.
//
// At end of input peek returns eof. Any other read error causes a panic,
// because the signature has no way to report it.
func (s *Scanner) peek() rune {
	ch, _, err := s.r.ReadRune()
	if err == io.EOF {
		return eof
	}
	if err != nil {
		panic(err)
	}
	// UnreadRune cannot fail right after a successful ReadRune.
	_ = s.r.UnreadRune()
	return ch
}