-
Notifications
You must be signed in to change notification settings - Fork 12
/
parser.c
488 lines (400 loc) · 11.8 KB
/
parser.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
#include "rl_jsonInt.h"
#include "parser.h"
enum char_advance_status {
CHAR_ADVANCE_OK,
CHAR_ADVANCE_UNESCAPED_NULL
};
void parse_error(struct parse_error* details, const char* errmsg, const unsigned char* doc, size_t char_ofs) //{{{
{
if (details == NULL) return;
details->errmsg = errmsg;
details->doc = (const char*)doc;
details->char_ofs = char_ofs;
}
//}}}
void throw_parse_error(Tcl_Interp* interp, struct parse_error* details) //{{{
{
char char_ofs_buf[20]; // 20 bytes allows for 19 bytes of decimal max 64 bit size_t, plus null terminator
snprintf(char_ofs_buf, 20, "%ld", details->char_ofs);
Tcl_SetObjResult(interp, Tcl_ObjPrintf("Error parsing JSON value: %s at offset %ld", details->errmsg, details->char_ofs));
Tcl_SetErrorCode(interp, "RL", "JSON", "PARSE", details->errmsg, details->doc, char_ofs_buf, NULL);
}
//}}}
struct parse_context* push_parse_context(struct parse_context* cx, const enum json_types container, const size_t char_ofs) //{{{
{
struct parse_context* last = cx->last;
struct parse_context* new;
if (last->container == JSON_UNDEF) {
new = last;
} else if (likely((ptrdiff_t)last >= (ptrdiff_t)cx && (ptrdiff_t)last < (ptrdiff_t)(cx + CX_STACK_SIZE - 1))) {
// Space remains on the cx array stack
new = cx->last+1;
} else {
new = (struct parse_context*)malloc(sizeof(*new));
}
new->prev = last;
if (last->mode == VALIDATE) {
new->val = NULL;
} else {
Tcl_IncrRefCount(
new->val = JSON_NewJvalObj(container, container == JSON_OBJECT ?
(cx->l ? cx->l->tcl_empty_dict : Tcl_NewDictObj()) :
(cx->l ? cx->l->tcl_empty_list : Tcl_NewListObj(0, NULL))
));
}
new->hold_key = NULL;
new->char_ofs = char_ofs;
new->container = container;
new->closed = 0;
new->objtype = g_objtype_for_type[container];
new->l = last->l;
new->mode = last->mode;
cx->last = new;
return new;
}
//}}}
struct parse_context* pop_parse_context(struct parse_context* cx) //{{{
{
struct parse_context* last = cx->last;
cx->last->closed = 1;
if (unlikely((ptrdiff_t)cx == (ptrdiff_t)last)) {
return cx->last;
}
if (likely(last->val != NULL)) {
if (last->prev->val && Tcl_IsShared(last->prev->val))
replace_tclobj(&last->prev->val, Tcl_DuplicateObj(last->prev->val));
append_to_cx(last->prev, last->val);
release_tclobj(&last->val);
}
if (likely((ptrdiff_t)last >= (ptrdiff_t)cx && (ptrdiff_t)last < (ptrdiff_t)(cx + CX_STACK_SIZE))) {
// last is on the cx array stack
cx->last--;
} else {
if (last->prev) {
cx->last = last->prev;
free(last);
}
}
return cx->last;
}
//}}}
void free_cx(struct parse_context* cx) //{{{
{
struct parse_context* tail = cx->last;
while (1) {
release_tclobj(&tail->hold_key);
release_tclobj(&tail->val);
if (tail == cx) break;
tail = pop_parse_context(cx);
}
}
//}}}
static int is_whitespace(const unsigned char c) //{{{
{
switch (c) {
case 0x20:
case 0x09:
case 0x0A:
case 0x0D:
return 1;
default:
return 0;
}
}
//}}}
static enum char_advance_status char_advance(const unsigned char** p, size_t* char_adj) //{{{
{
// TODO: use Tcl_UtfNext instead?
// This relies on some properties from the utf-8 returned by Tcl_GetString:
// - no invalid encodings (partial utf-8 sequences, etc)
// - not truncated in the middle of a char
const unsigned char first = **p;
unsigned int eat;
(*p)++;
if (unlikely(first >= 0xC0)) {
// Advance to next UTF-8 character
// TODO: detect invalid sequences?
if (first == 0xC0 && **p == 0x80) {
(*p)--;
// Have to check for this here - other unescaped control chars are handled by the < 0x1F
// test, but 0x00 is transformed to 0xC0 0x80 by Tcl (MUTF-8 rather than UTF-8)
return CHAR_ADVANCE_UNESCAPED_NULL;
}
if (first < 0xe0 /* 0b11100000 */) {
eat = 1;
#if TCL_UTF_MAX == 3
} else {
eat = 2;
#else
} else if (first < 0xf0 /* 0b11110000 */) {
eat = 2;
} else if (first < 0xf8 /* 0b11111000 */) {
eat = 3;
} else if (first < 0xfc /* 0b11111100 */) {
eat = 4;
} else {
eat = 5;
#endif
}
*p += eat;
*char_adj += eat;
}
return CHAR_ADVANCE_OK;
}
//}}}
int skip_whitespace(const unsigned char** s, const unsigned char* e, const char** errmsg, const unsigned char** err_at, size_t* char_adj, enum extensions extensions) //{{{
{
const unsigned char* p = *s;
const unsigned char* start;
size_t start_char_adj;
enum char_advance_status status = CHAR_ADVANCE_OK;
consume_space_or_comment:
while (is_whitespace(*p)) p++;
if (unlikely((extensions & EXT_COMMENTS) && *p == '/')) {
start = p;
start_char_adj = *char_adj;
p++;
if (*p == '/') {
p++;
while (likely(p < e && (*p > 0x1f || *p == 0x09) && status == CHAR_ADVANCE_OK))
status = char_advance(&p, char_adj);
} else if (*p == '*') {
p++;
while (likely(p < e-2 && *p != '*')) {
if (unlikely(*p <= 0x1f && !is_whitespace(*p))) goto err_illegal_char;
status = char_advance(&p, char_adj);
if (unlikely(status != CHAR_ADVANCE_OK)) goto err_illegal_char;
}
if (unlikely(*p++ != '*' || *p++ != '/')) goto err_unterminated;
} else {
goto err_illegal_char;
}
goto consume_space_or_comment;
}
*s = p;
return 0;
err_unterminated:
*err_at = start;
*char_adj = start_char_adj;
*errmsg = "Unterminated comment";
*s = p;
return 1;
err_illegal_char:
*err_at = p;
*errmsg = "Illegal character";
*s = p;
return 1;
}
//}}}
int is_template(const char* s, int len) //{{{
{
if (
len >= 3 &&
s[0] == '~' &&
s[2] == ':'
) {
switch (s[1]) {
case 'S':
case 'N':
case 'B':
case 'J':
case 'T':
case 'L':
return 1;
}
}
return 0;
}
//}}}
int value_type(struct interp_cx* l, const unsigned char* doc, const unsigned char* p, const unsigned char* e, size_t* char_adj, const unsigned char** next, enum json_types *type, Tcl_Obj** val, struct parse_error* details) //{{{
{
const unsigned char* err_at = NULL;
const char* errmsg = NULL;
Tcl_Obj* out = NULL;
if (val)
replace_tclobj(val, NULL);
if (unlikely(p >= e)) goto err;
switch (*p) {
case '"':
p++; // Advance past the " to the first byte of the string
{
const unsigned char* chunk;
size_t len;
char mapped;
enum json_types stype = JSON_STRING;
enum char_advance_status status = CHAR_ADVANCE_OK;
// Peek ahead to detect template subst markers.
TEMPLATE_TYPE(p, e-p, stype);
while (1) {
chunk = p;
// These tests are where the majority of the parsing time is spent
while (likely(p < e && *p != '"' && *p != '\\' && *p > 0x1f && status == CHAR_ADVANCE_OK))
status = char_advance(&p, char_adj);
if (unlikely(p >= e)) goto err;
len = p-chunk;
if (likely(out == NULL)) {
replace_tclobj(&out, get_string(l, (const char*)chunk, len));
} else if (len > 0) {
if (unlikely(Tcl_IsShared(out)))
replace_tclobj(&out, Tcl_DuplicateObj(out));
Tcl_AppendToObj(out, (const char*)chunk, len);
}
if (likely(*p == '"')) {
p++; // Point at the first byte after the string
break;
}
if (unlikely(*p != '\\')) goto err;
p++; // Advance to the backquoted byte
if (unlikely(Tcl_IsShared(out)))
replace_tclobj(&out, Tcl_DuplicateObj(out));
switch (*p) { // p could point at the NULL terminator at this point
case '\\':
case '"':
case '/': // RFC4627 allows this for some reason
mapped = *p;
goto append_mapped;
case 'b': mapped='\b'; goto append_mapped;
case 'f': mapped='\f'; goto append_mapped;
case 'n': mapped='\n'; goto append_mapped;
case 'r': mapped='\r'; goto append_mapped;
case 't': mapped='\t';
append_mapped: Tcl_AppendToObj(out, &mapped, 1); // Weird, but arranged this way the compiler optimizes it to a jump table
break;
case 'u':
{
Tcl_UniChar acc = 0;
char utfbuf[6];
int i=4, digit;
if (unlikely(e-p-2 < i)) { // -2 is for the "u" and the close quote
err_at = e-1;
if (err_at < e && *err_at != '"')
err_at++; // Bodge the case where the doc was truncated without a closing quote
errmsg = "Unicode sequence too short";
goto err;
}
while (i--) {
digit = *++p - '0';
if (unlikely(digit < 0)) {
err_at = p;
errmsg = "Unicode sequence too short";
goto err;
}
if (digit > 9) {
digit -= (digit < 'a' - '0') ?
'A' - '0' - 0xA :
'a' - '0' - 0xA;
if (unlikely(digit < 0xA || digit > 0xF)) {
err_at = p;
errmsg = "Unicode sequence too short";
goto err;
}
}
acc <<= 4;
acc += digit;
}
if (
(acc >= 0xD800 && acc <= 0xDBFF) ||
(acc >= 0xDC00 && acc <= 0xDFFF)
) {
// Replace invalid codepoints (in the high and low surrogate ranges for UTF-16) with
// U+FFFD in accordance with Unicode recommendations
acc = 0xFFFD;
}
//const unsigned char* utfend = output_utf8(acc, utfbuf);
const int len = Tcl_UniCharToUtf(acc, utfbuf);
Tcl_AppendToObj(out, utfbuf, len);
}
break;
default:
goto err;
}
p++; // Advance to the first byte after the backquoted sequence
}
*type = stype;
if (val)
replace_tclobj(val, out);
}
break;
case '{':
*type = JSON_OBJECT;
p++;
break;
case '[':
*type = JSON_ARRAY;
p++;
break;
case 't':
if (unlikely(e-p < 4 || *(uint32_t*)p != *(uint32_t*)"true")) goto err; // Evil endian-compensated trick
*type = JSON_BOOL;
if (val)
replace_tclobj(val, l ? l->tcl_true : Tcl_NewBooleanObj(1));
p += 4;
break;
case 'f':
if (unlikely(e-p < 5 || *(uint32_t*)(p+1) != *(uint32_t*)"alse")) goto err; // Evil endian-compensated trick
*type = JSON_BOOL;
if (val)
replace_tclobj(val, l ? l->tcl_false : Tcl_NewBooleanObj(0));
p += 5;
break;
case 'n':
if (unlikely(e-p < 4 || *(uint32_t*)p != *(uint32_t*)"null")) goto err; // Evil endian-compensated trick
*type = JSON_NULL;
if (val)
replace_tclobj(val, l ? l->tcl_empty : Tcl_NewStringObj("", 0));
p += 4;
break;
default:
{
const unsigned char* start = p;
const unsigned char* t;
if (*p == '-') p++;
if (unlikely(
*p == '0' && (
// Only 3 characters can legally follow a leading '0' according to the spec:
// . - fraction, e or E - exponent
(p[1] >= '0' && p[1] <= '9') // Octal, hex, or decimal with leading 0
)
)) {
// Indexing one beyond p is safe - all the strings we
// receive are guaranteed to be null terminated by Tcl, and
// *p here is '0'
err_at = p;
errmsg = "Leading 0 not allowed for numbers";
goto err;
}
t = p;
while (*p >= '0' && *p <= '9') p++;
if (unlikely(p == t)) goto err; // No integer part after the decimal point
if (*p == '.') { // p could point at the NULL terminator at this point
p++;
t = p;
while (*p >= '0' && *p <= '9') p++;
if (unlikely(p == t)) goto err; // No integer part after the decimal point
}
if (unlikely((*p | 0x20 /* 0b100000 */) == 'e')) { // p could point at the NULL terminator at this point
p++;
if (*p == '+' || *p == '-') p++;
t = p;
while (*p >= '0' && *p <= '9') p++;
if (unlikely(p == t)) goto err; // No integer part after the exponent symbol
}
*type = JSON_NUMBER;
if (val)
replace_tclobj(val, get_string(l, (const char*)start, p-start));
}
}
release_tclobj(&out);
*next = p;
return TCL_OK;
err:
release_tclobj(&out);
if (err_at == NULL)
err_at = p;
if (errmsg == NULL)
errmsg = (err_at == e) ? "Document truncated" : "Illegal character";
parse_error(details, errmsg, doc, (err_at - doc) - *char_adj);
return TCL_ERROR;
}
//}}}
// vim: foldmethod=marker foldmarker={{{,}}} ts=4 shiftwidth=4