// Copyright 2018 Couchbase, Inc. All rights reserved. package gojsonsm import ( "bytes" "errors" "unicode/utf8" ) var ( MalformedStringEscapeError = errors.New("Encountered an invalid escape sequence in a string") ) // JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7 const supplementalPlanesOffset = 0x10000 const highSurrogateOffset = 0xD800 const lowSurrogateOffset = 0xDC00 const basicMultilingualPlaneReservedOffset = 0xDFFF const basicMultilingualPlaneOffset = 0xFFFF func combineUTF16Surrogates(high, low rune) rune { return supplementalPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset) } const badHex = -1 func h2I(c byte) int { switch { case c >= '0' && c <= '9': return int(c - '0') case c >= 'A' && c <= 'F': return int(c - 'A' + 10) case c >= 'a' && c <= 'f': return int(c - 'a' + 10) } return badHex } // decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. The prefix \u is assumed to be present and // is not checked. // In JSON, these escapes can either come alone or as part of "UTF16 surrogate pairs" that must be handled together. // This function only handles one; decodeUnicodeEscape handles this more complex case. func decodeSingleUnicodeEscape(in []byte) (rune, bool) { // We need at least 6 characters total if len(in) < 6 { return utf8.RuneError, false } // Convert hex to decimal h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5]) if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex { return utf8.RuneError, false } // Compose the hex digits return rune(h1<<12 + h2<<8 + h3<<4 + h4), true } // isUTF16EncodedRune checks if a rune is in the range for non-BMP characters, // which is used to describe UTF16 chars. // Source: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane func isUTF16EncodedRune(r rune) bool { return highSurrogateOffset <= r && r <= basicMultilingualPlaneReservedOffset } func decodeUnicodeEscape(in []byte) (rune, int) { if r, ok := decodeSingleUnicodeEscape(in); !ok { // Invalid Unicode escape return utf8.RuneError, -1 } else if r <= basicMultilingualPlaneOffset && !isUTF16EncodedRune(r) { // Valid Unicode escape in Basic Multilingual Plane return r, 6 } else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain // UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate" return utf8.RuneError, -1 } else if r2 < lowSurrogateOffset { // Invalid UTF16 "low surrogate" return utf8.RuneError, -1 } else { // Valid UTF16 surrogate pair return combineUTF16Surrogates(r, r2), 12 } } // backslashCharEscapeTable: when '\X' is found for some byte X, it is to be replaced with backslashCharEscapeTable[X] var backslashCharEscapeTable = [...]byte{ '"': '"', '\\': '\\', '/': '/', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', } // unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns // how many characters were consumed from 'in' and emitted into 'out'. // If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error. func unescapeToUTF8(in, out []byte) (inLen int, outLen int) { if len(in) < 2 || in[0] != '\\' { // Invalid escape due to insufficient characters for any escape or no initial backslash return -1, -1 } // https://tools.ietf.org/html/rfc7159#section-7 switch e := in[1]; e { case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': // Valid basic 2-character escapes (use lookup table) out[0] = backslashCharEscapeTable[e] return 2, 1 case 'u': // Unicode escape if r, inLen := decodeUnicodeEscape(in); inLen == -1 { // Invalid Unicode escape return -1, -1 } else { // Valid Unicode escape; re-encode as UTF8 outLen := utf8.EncodeRune(out, r) return inLen, outLen } } return -1, -1 } // unescape unescapes the string contained in 'in' and returns it as a slice. // If 'in' contains no escaped characters: // Returns 'in'. // Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)): // 'out' is used to build the unescaped string and is returned with no extra allocation // Else: // A new slice is allocated and returned. func unescapeJsonString(in, out []byte) ([]byte, error) { firstBackslash := bytes.IndexByte(in, '\\') if firstBackslash == -1 { return in, nil } // Get a buffer of sufficient size (allocate if needed) if cap(out) < len(in) { out = make([]byte, len(in)) } else { out = out[0:len(in)] } // Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice) copy(out, in[:firstBackslash]) in = in[firstBackslash:] buf := out[firstBackslash:] for len(in) > 0 { // Unescape the next escaped character inLen, bufLen := unescapeToUTF8(in, buf) if inLen == -1 { return nil, MalformedStringEscapeError } in = in[inLen:] buf = buf[bufLen:] // Copy everything up until the next backslash nextBackslash := bytes.IndexByte(in, '\\') if nextBackslash == -1 { copy(buf, in) buf = buf[len(in):] break } else { copy(buf, in[:nextBackslash]) buf = buf[nextBackslash:] in = in[nextBackslash:] } } // Trim the out buffer to the amount that was actually emitted return out[:len(out)-len(buf)], nil }