AK: Bring JsonParser's string consumption closer to the ECMA 404 spec

I added some spec comments, and implementation notices, this should not change behavior in a significant way. The previous code was quite unwieldy and repetitive. The long `if(next_is('X'))` chain is now a smaller `switch`. I also reinstated the fast path for long sequences of literal characters, which was broken in 0aad21f
lanmonster · Nov 1, 2023 · e02a4f5 · e02a4f5
1 parent c990db0
commit e02a4f5
Showing 1 changed file with 79 additions and 69 deletions.
diff --git a/AK/JsonParser.cpp b/AK/JsonParser.cpp
@@ -18,105 +18,115 @@ constexpr bool is_space(int ch)
  return ch == '\t' || ch == '\n' || ch == '\r' || ch == ' ';
 }
 
+// ECMA-404 9 String
+// Boils down to
+// STRING = "\"" *("[^\"\\]" | "\\" ("[\"\\bfnrt]" | "u[0-9A-Za-z]{4}")) "\""
+// │├── " ──╮───────────────────────────────────────────────╭── " ──┤│
+// │ │
+// │ ╭───────────────────<─────────────────────╮ │
+// │ │ │ │
+// ╰──╰──╮───────────── [^"\\] ──────────────╭──╯──╯
+// │ │
+// ╰── \ ───╮──── ["\\bfnrt] ───────╭──╯
+// │ │
+// ╰─── u[0-9A-Za-z]{4} ──╯
+//
 ErrorOr<DeprecatedString> JsonParser::consume_and_unescape_string()
 {
  if (!consume_specific('"'))
  return Error::from_string_literal("JsonParser: Expected '\"'");
  StringBuilder final_sb;
 
  for (;;) {
- size_t peek_index = m_index;
- char ch = 0;
+ // OPTIMIZATION: We try to append as many literal characters as possible at a time
+ // This also pre-checks some error conditions
+ // Note: All utf8 characters are either plain ascii, or have their most signifiant bit set,
+ // which puts the, above plain ascii in value, so they will always consist
+ // of a set of "legal" non-special bytes,
+ // hence we don't need to bother with a code-point iterator,
+ // as a simple byte iterator suffices, which GenericLexer provides by default
+ size_t literal_characters = 0;
  for (;;) {
- if (peek_index == m_input.length())
- break;
- ch = m_input[peek_index];
+ char ch = peek(literal_characters);
+ // Note: We get a 0 byte when we hit EOF
+ if (ch == 0)
+ return Error::from_string_literal("JsonParser: EOF while parsing String");
+ // Spec: All code points may be placed within the quotation marks except
+ // for the code points that must be escaped: quotation mark (U+0022),
+ // reverse solidus (U+005C), and the control characters U+0000 to U+001F.
+ // There are two-character escape sequence representations of some characters.
+ if (is_ascii_c0_control(ch))
+ return Error::from_string_literal("JsonParser: ASCII control sequence encountered");
  if (ch == '"' || ch == '\\')
  break;
- if (is_ascii_c0_control(ch))
- return Error::from_string_literal("JsonParser: Error while parsing string");
- ++peek_index;
+ ++literal_characters;
  }
+ final_sb.append(consume(literal_characters));
 
- while (peek_index != m_index) {
- final_sb.append(m_input[m_index]);
- m_index++;
- }
+ // We have checked all cases except end-of-string and escaped characters in the loop above,
+ // so we now only have to handle those two cases
+ char ch = peek();
 
- if (m_index == m_input.length())
- break;
- if (ch == '"')
+ if (ch == '"') {
+ consume();
  break;
- if (ch != '\\') {
- final_sb.append(consume());
- continue;
- }
- ignore();
- if (next_is('"')) {
- ignore();
- final_sb.append('"');
- continue;
  }
 
- if (next_is('\\')) {
- ignore();
- final_sb.append('\\');
- continue;
- }
+ ignore(); // '\'
 
- if (next_is('/')) {
+ switch (peek()) {
+ case '\0':
+ return Error::from_string_literal("JsonParser: EOF while parsing String");
+ case '"':
+ case '\\':
+ case '/':
+ final_sb.append(consume());
+ break;
+ case 'b':
  ignore();
- final_sb.append('/');
- continue;
- }
-
- if (next_is('n')) {
+ final_sb.append('\b');
+ break;
+ case 'f':
+ ignore();
+ final_sb.append('\f');
+ break;
+ case 'n':
  ignore();
  final_sb.append('\n');
- continue;
- }
-
- if (next_is('r')) {
+ break;
+ case 'r':
  ignore();
  final_sb.append('\r');
- continue;
- }
-
- if (next_is('t')) {
+ break;
+ case 't':
  ignore();
  final_sb.append('\t');
- continue;
- }
-
- if (next_is('b')) {
- ignore();
- final_sb.append('\b');
- continue;
- }
-
- if (next_is('f')) {
- ignore();
- final_sb.append('\f');
- continue;
- }
+ break;
+ case 'u': {
+ ignore(); // 'u'
 
- if (next_is('u')) {
- ignore();
  if (tell_remaining() < 4)
  return Error::from_string_literal("JsonParser: EOF while parsing Unicode escape");
-
- auto code_point = AK::StringUtils::convert_to_uint_from_hex(consume(4));
- if (code_point.has_value()) {
- final_sb.append_code_point(code_point.value());
- continue;
+ auto escaped_string = consume(4);
+ auto code_point = AK::StringUtils::convert_to_uint_from_hex(escaped_string);
+ if (!code_point.has_value()) {
+ dbgln("JsonParser: Error while parsing Unicode escape {}", escaped_string);
+ return Error::from_string_literal("JsonParser: Error while parsing Unicode escape");
  }
- return Error::from_string_literal("JsonParser: Error while parsing Unicode escape");
+ // Note/FIXME: "To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
+ // twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
+ // example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
+ // However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
+ // explicit surrogate pair is a semantic decision that is determined by the specific processor."
+ // ~ECMA-404, 2nd Edition Dec. 2017, page 5
+ final_sb.append_code_point(code_point.value());
+ break;
+ }
+ default:
+ dbgln("JsonParser: Invalid escaped character '{}' ({:#x}) ", peek(), peek());
+ return Error::from_string_literal("JsonParser: Invalid escaped character");
  }
-
- return Error::from_string_literal("JsonParser: Error while parsing string");
  }
- if (!consume_specific('"'))
- return Error::from_string_literal("JsonParser: Expected '\"'");
 
  return final_sb.to_deprecated_string();
 }