forked from hashicorp/hcl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scanner.go
309 lines (283 loc) · 7.46 KB
/
scanner.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package json
import (
"fmt"
"github.com/apparentlymart/go-textseg/v13/textseg"
"github.com/hashicorp/hcl/v2"
)
//go:generate stringer -type tokenType scanner.go
type tokenType rune
const (
tokenBraceO tokenType = '{'
tokenBraceC tokenType = '}'
tokenBrackO tokenType = '['
tokenBrackC tokenType = ']'
tokenComma tokenType = ','
tokenColon tokenType = ':'
tokenKeyword tokenType = 'K'
tokenString tokenType = 'S'
tokenNumber tokenType = 'N'
tokenEOF tokenType = '␄'
tokenInvalid tokenType = 0
tokenEquals tokenType = '=' // used only for reminding the user of JSON syntax
)
type token struct {
Type tokenType
Bytes []byte
Range hcl.Range
}
// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
var tokens []token
p := start
for {
if len(buf) == 0 {
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
buf, p = skipWhitespace(buf, p)
if len(buf) == 0 {
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
start = p
first := buf[0]
switch {
case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
p.Pos.Column++
p.Pos.Byte++
tokens = append(tokens, token{
Type: tokenType(first),
Bytes: buf[0:1],
Range: posRange(start, p),
})
buf = buf[1:]
case first == '"':
var tokBuf []byte
tokBuf, buf, p = scanString(buf, p)
tokens = append(tokens, token{
Type: tokenString,
Bytes: tokBuf,
Range: posRange(start, p),
})
case byteCanStartNumber(first):
var tokBuf []byte
tokBuf, buf, p = scanNumber(buf, p)
tokens = append(tokens, token{
Type: tokenNumber,
Bytes: tokBuf,
Range: posRange(start, p),
})
case byteCanStartKeyword(first):
var tokBuf []byte
tokBuf, buf, p = scanKeyword(buf, p)
tokens = append(tokens, token{
Type: tokenKeyword,
Bytes: tokBuf,
Range: posRange(start, p),
})
default:
tokens = append(tokens, token{
Type: tokenInvalid,
Bytes: buf[:1],
Range: start.Range(1, 1),
})
// If we've encountered an invalid then we might as well stop
// scanning since the parser won't proceed beyond this point.
// We insert a synthetic EOF marker here to match the expectations
// of consumers of this data structure.
p.Pos.Column++
p.Pos.Byte++
tokens = append(tokens, token{
Type: tokenEOF,
Bytes: nil,
Range: posRange(p, p),
})
return tokens
}
}
}
func byteCanStartNumber(b byte) bool {
switch b {
// We are slightly more tolerant than JSON requires here since we
// expect the parser will make a stricter interpretation of the
// number bytes, but we specifically don't allow 'e' or 'E' here
// since we want the scanner to treat that as the start of an
// invalid keyword instead, to produce more intelligible error messages.
case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
return true
default:
return false
}
}
func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
// The scanner doesn't check that the sequence of digit-ish bytes is
// in a valid order. The parser must do this when decoding a number
// token.
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
switch buf[i] {
case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
p.Pos.Byte++
p.Pos.Column++
default:
break Byte
}
}
return buf[:i], buf[i:], p
}
func byteCanStartKeyword(b byte) bool {
switch {
// We allow any sequence of alphabetical characters here, even though
// JSON is more constrained, so that we can collect what we presume
// the user intended to be a single keyword and then check its validity
// in the parser, where we can generate better diagnostics.
// So e.g. we want to be able to say:
// unrecognized keyword "True". Did you mean "true"?
case isAlphabetical(b):
return true
default:
return false
}
}
func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
b := buf[i]
switch {
case isAlphabetical(b) || b == '_':
p.Pos.Byte++
p.Pos.Column++
default:
break Byte
}
}
return buf[:i], buf[i:], p
}
func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
// The scanner doesn't validate correct use of escapes, etc. It pays
// attention to escapes only for the purpose of identifying the closing
// quote character. It's the parser's responsibility to do proper
// validation.
//
// The scanner also doesn't specifically detect unterminated string
// literals, though they can be identified in the parser by checking if
// the final byte in a string token is the double-quote character.
// Skip the opening quote symbol
i := 1
p := start
p.Pos.Byte++
p.Pos.Column++
escaping := false
Byte:
for i < len(buf) {
b := buf[i]
switch {
case b == '\\':
escaping = !escaping
p.Pos.Byte++
p.Pos.Column++
i++
case b == '"':
p.Pos.Byte++
p.Pos.Column++
i++
if !escaping {
break Byte
}
escaping = false
case b < 32:
break Byte
default:
// Advance by one grapheme cluster, so that we consider each
// grapheme to be a "column".
// Ignoring error because this scanner cannot produce errors.
advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)
p.Pos.Byte += advance
p.Pos.Column++
i += advance
escaping = false
}
}
return buf[:i], buf[i:], p
}
func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
var i int
p := start
Byte:
for i = 0; i < len(buf); i++ {
switch buf[i] {
case ' ':
p.Pos.Byte++
p.Pos.Column++
case '\n':
p.Pos.Byte++
p.Pos.Column = 1
p.Pos.Line++
case '\r':
// For the purpose of line/column counting we consider a
// carriage return to take up no space, assuming that it will
// be paired up with a newline (on Windows, for example) that
// will account for both of them.
p.Pos.Byte++
case '\t':
// We arbitrarily count a tab as if it were two spaces, because
// we need to choose _some_ number here. This means any system
// that renders code on-screen with markers must itself treat
// tabs as a pair of spaces for rendering purposes, or instead
// use the byte offset and back into its own column position.
p.Pos.Byte++
p.Pos.Column += 2
default:
break Byte
}
}
return buf[i:], p
}
type pos struct {
Filename string
Pos hcl.Pos
}
func (p *pos) Range(byteLen, charLen int) hcl.Range {
start := p.Pos
end := p.Pos
end.Byte += byteLen
end.Column += charLen
return hcl.Range{
Filename: p.Filename,
Start: start,
End: end,
}
}
func posRange(start, end pos) hcl.Range {
return hcl.Range{
Filename: start.Filename,
Start: start.Pos,
End: end.Pos,
}
}
func (t token) GoString() string {
return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}
func isAlphabetical(b byte) bool {
return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}