Skip to content

Commit

Permalink
Transform invalid unicode sequences into U+FFFD as per recommendations
Browse files Browse the repository at this point in the history
  • Loading branch information
cyanogilvie committed Jan 26, 2020
1 parent 6690f96 commit de46844
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ perf.data.old
.project
.settings
tests/JSONTestSuite/test_parsing
tests/JSONTestSuite/test_transform
tclobjs_remaining
47 changes: 33 additions & 14 deletions fetch_test_cases.tcl
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
#!/usr/bin/env tclsh

# NOTE: This script doesn't currently work. It should, but GitHub serves the binary .json files as UTF-8,
# which breaks all the carefully crafted encoding tests

# Dependencies:
# rl_json: https://github.com/RubyLane/rl_json
# parse_args: https://github.com/RubyLane/parse_args
# rl_http: https://github.com/RubyLane/rl_http
# urlencode: copied to support/
# uri: tcllib
# Thread: https://core.tcl-lang.org/thread
# tls: https://core.tcl-lang.org/tcltls/index

set here [file dirname [file normalize [info script]]]

tcl::tm::path add [file join $here support]
lappend auto_path $here

package require rl_json ;# yeah...
package require rl_http
Expand Down Expand Up @@ -40,7 +47,10 @@ proc http {method url args} { #<<<
}

switch -glob -- [$h code] {
2* {return [$h body]}
2* {
#puts "Headers:\n\t[join [lmap {k v} [$h headers] {format {%s: %s} $k $v}] \n\t]"
return [$h body]
}
304 {throw [list HTTP CODE [$h code]] "Not modified"}
301 - 302 - 307 {set url [lindex [dict get [$h headers] location] 0]}
403 {
Expand Down Expand Up @@ -121,18 +131,7 @@ proc writebin {fn data} { #<<<

#>>>

parse_args $argv {
-pretty {-boolean}
}

set dest [file join $here tests JSONTestSuite test_parsing]
file mkdir $dest

set listing [github api GET -owner nst -repo JSONTestSuite test_parsing]

#puts [json pretty $listing]

json foreach file $listing {
proc fetch_file {dest file} { #<<<
set fn [file join $dest [json get $file name]]
if {[file exists $fn]} {
set mtime [file mtime $fn]
Expand All @@ -144,7 +143,7 @@ json foreach file $listing {
writebin [file join $dest [json get $file name]] $contents
} trap {HTTP CODE 304} {} {
puts " not modified"
continue
return
} on error {errmsg options} {
puts " Error: $errmsg"
}
Expand All @@ -161,4 +160,24 @@ json foreach file $listing {
}
}

#>>>

# For each JSONTestSuite directory of interest, create the matching local
# directory under tests/JSONTestSuite, request that directory's listing from
# the nst/JSONTestSuite repo, and hand every listed entry to fetch_file.
foreach suite {test_parsing test_transform} {
	set dest [file join $here tests JSONTestSuite $suite]
	file mkdir $dest

	set listing [github api GET -owner nst -repo JSONTestSuite $suite]
	#puts [json pretty $listing]
	json foreach file $listing {
		fetch_file $dest $file
	}
}

# vim: foldmethod=marker foldmarker=<<<,>>> ts=4 shiftwidth=4
13 changes: 11 additions & 2 deletions generic/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,14 @@ append_mapped: Tcl_AppendToObj(out, &mapped, 1); // Weird, but arranged this
acc += digit;
}

if (
(acc >= 0xD800 && acc <= 0xDBFF) ||
(acc >= 0xDC00 && acc <= 0xDFFF)
) {
// Replace invalid codepoints (in the high and low surrogate ranges for UTF-16) with
// U+FFFD in accordance with Unicode recommendations
acc = 0xFFFD;
}
Tcl_AppendUnicodeToObj(out, &acc, 1);
}
break;
Expand Down Expand Up @@ -380,8 +388,9 @@ append_mapped: Tcl_AppendToObj(out, &mapped, 1); // Weird, but arranged this

if (unlikely(
*p == '0' && (
(p[1] >= '0' && p[1] <= '9') || // Octal
(p[1] == 'x' || p[1] == 'X') // Hex
// Only 3 characters can legally follow a leading '0' according to the spec:
// . - fraction, e or E - exponent
(p[1] >= '0' && p[1] <= '9') // Octal, hex, or decimal with leading 0
)
)) {
// Indexing one beyond p is safe - all the strings we
Expand Down
5 changes: 2 additions & 3 deletions generic/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,10 @@ struct interp_cx {
const Tcl_ObjType* typeDict; // Evil hack to identify objects of type dict, used to choose whether to iterate over a list of pairs as a dict or a list, for efficiency

const Tcl_ObjType* typeInt; // Evil hack to snoop on the type of a number, so that we don't have to add 0 to a candidate to know if it's a valid number
const Tcl_ObjType* typeLong;
const Tcl_ObjType* typeWideInt;
const Tcl_ObjType* typeDouble;
const Tcl_ObjType* typeBoolean;
const Tcl_ObjType* typeBignum;
Tcl_Obj* apply;
Tcl_Obj* decode_bytes;
};

#define CX_STACK_SIZE 6
Expand Down

0 comments on commit de46844

Please sign in to comment.