json/scanner.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package json

import (
	"fmt"

	"github.com/apparentlymart/go-textseg/v13/textseg"
	"github.com/hashicorp/hcl/v2"
)

//go:generate stringer -type tokenType scanner.go
type tokenType rune

const (
	tokenBraceO  tokenType = '{'
	tokenBraceC  tokenType = '}'
	tokenBrackO  tokenType = '['
	tokenBrackC  tokenType = ']'
	tokenComma   tokenType = ','
	tokenColon   tokenType = ':'
	tokenKeyword tokenType = 'K'
	tokenString  tokenType = 'S'
	tokenNumber  tokenType = 'N'
	tokenEOF     tokenType = '␄'
	tokenInvalid tokenType = 0
	tokenEquals  tokenType = '=' // used only for reminding the user of JSON syntax
)

type token struct {
	Type  tokenType
	Bytes []byte
	Range hcl.Range
}

// scan returns the primary tokens for the given JSON buffer in sequence.
//
// The responsibility of this pass is to just mark the slices of the buffer
// as being of various types. It is lax in how it interprets the multi-byte
// token types keyword, string and number, preferring to capture erroneous
// extra bytes that we presume the user intended to be part of the token
// so that we can generate more helpful diagnostics in the parser.
func scan(buf []byte, start pos) []token {
	var tokens []token
	p := start
	for {
		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		buf, p = skipWhitespace(buf, p)

		if len(buf) == 0 {
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}

		start = p

		first := buf[0]
		switch {
		case first == '{' || first == '}' || first == '[' || first == ']' || first == ',' || first == ':' || first == '=':
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenType(first),
				Bytes: buf[0:1],
				Range: posRange(start, p),
			})
			buf = buf[1:]
		case first == '"':
			var tokBuf []byte
			tokBuf, buf, p = scanString(buf, p)
			tokens = append(tokens, token{
				Type:  tokenString,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartNumber(first):
			var tokBuf []byte
			tokBuf, buf, p = scanNumber(buf, p)
			tokens = append(tokens, token{
				Type:  tokenNumber,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		case byteCanStartKeyword(first):
			var tokBuf []byte
			tokBuf, buf, p = scanKeyword(buf, p)
			tokens = append(tokens, token{
				Type:  tokenKeyword,
				Bytes: tokBuf,
				Range: posRange(start, p),
			})
		default:
			tokens = append(tokens, token{
				Type:  tokenInvalid,
				Bytes: buf[:1],
				Range: start.Range(1, 1),
			})
			// If we've encountered an invalid then we might as well stop
			// scanning since the parser won't proceed beyond this point.
			// We insert a synthetic EOF marker here to match the expectations
			// of consumers of this data structure.
			p.Pos.Column++
			p.Pos.Byte++
			tokens = append(tokens, token{
				Type:  tokenEOF,
				Bytes: nil,
				Range: posRange(p, p),
			})
			return tokens
		}
	}
}

func byteCanStartNumber(b byte) bool {
	switch b {
	// We are slightly more tolerant than JSON requires here since we
	// expect the parser will make a stricter interpretation of the
	// number bytes, but we specifically don't allow 'e' or 'E' here
	// since we want the scanner to treat that as the start of an
	// invalid keyword instead, to produce more intelligible error messages.
	case '-', '+', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return true
	default:
		return false
	}
}

func scanNumber(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't check that the sequence of digit-ish bytes is
	// in a valid order. The parser must do this when decoding a number
	// token.
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case '-', '+', '.', 'e', 'E', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

func byteCanStartKeyword(b byte) bool {
	switch {
	// We allow any sequence of alphabetical characters here, even though
	// JSON is more constrained, so that we can collect what we presume
	// the user intended to be a single keyword and then check its validity
	// in the parser, where we can generate better diagnostics.
	// So e.g. we want to be able to say:
	//   unrecognized keyword "True". Did you mean "true"?
	case isAlphabetical(b):
		return true
	default:
		return false
	}
}

func scanKeyword(buf []byte, start pos) ([]byte, []byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		b := buf[i]
		switch {
		case isAlphabetical(b) || b == '_':
			p.Pos.Byte++
			p.Pos.Column++
		default:
			break Byte
		}
	}
	return buf[:i], buf[i:], p
}

func scanString(buf []byte, start pos) ([]byte, []byte, pos) {
	// The scanner doesn't validate correct use of escapes, etc. It pays
	// attention to escapes only for the purpose of identifying the closing
	// quote character. It's the parser's responsibility to do proper
	// validation.
	//
	// The scanner also doesn't specifically detect unterminated string
	// literals, though they can be identified in the parser by checking if
	// the final byte in a string token is the double-quote character.

	// Skip the opening quote symbol
	i := 1
	p := start
	p.Pos.Byte++
	p.Pos.Column++
	escaping := false
Byte:
	for i < len(buf) {
		b := buf[i]

		switch {
		case b == '\\':
			escaping = !escaping
			p.Pos.Byte++
			p.Pos.Column++
			i++
		case b == '"':
			p.Pos.Byte++
			p.Pos.Column++
			i++
			if !escaping {
				break Byte
			}
			escaping = false
		case b < 32:
			break Byte
		default:
			// Advance by one grapheme cluster, so that we consider each
			// grapheme to be a "column".
			// Ignoring error because this scanner cannot produce errors.
			advance, _, _ := textseg.ScanGraphemeClusters(buf[i:], true)

			p.Pos.Byte += advance
			p.Pos.Column++
			i += advance

			escaping = false
		}
	}
	return buf[:i], buf[i:], p
}

func skipWhitespace(buf []byte, start pos) ([]byte, pos) {
	var i int
	p := start
Byte:
	for i = 0; i < len(buf); i++ {
		switch buf[i] {
		case ' ':
			p.Pos.Byte++
			p.Pos.Column++
		case '\n':
			p.Pos.Byte++
			p.Pos.Column = 1
			p.Pos.Line++
		case '\r':
			// For the purpose of line/column counting we consider a
			// carriage return to take up no space, assuming that it will
			// be paired up with a newline (on Windows, for example) that
			// will account for both of them.
			p.Pos.Byte++
		case '\t':
			// We arbitrarily count a tab as if it were two spaces, because
			// we need to choose _some_ number here. This means any system
			// that renders code on-screen with markers must itself treat
			// tabs as a pair of spaces for rendering purposes, or instead
			// use the byte offset and back into its own column position.
			p.Pos.Byte++
			p.Pos.Column += 2
		default:
			break Byte
		}
	}
	return buf[i:], p
}

type pos struct {
	Filename string
	Pos      hcl.Pos
}

func (p *pos) Range(byteLen, charLen int) hcl.Range {
	start := p.Pos
	end := p.Pos
	end.Byte += byteLen
	end.Column += charLen
	return hcl.Range{
		Filename: p.Filename,
		Start:    start,
		End:      end,
	}
}

func posRange(start, end pos) hcl.Range {
	return hcl.Range{
		Filename: start.Filename,
		Start:    start.Pos,
		End:      end.Pos,
	}
}

func (t token) GoString() string {
	return fmt.Sprintf("json.token{json.%s, []byte(%q), %#v}", t.Type, t.Bytes, t.Range)
}

func isAlphabetical(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
}