LibJS: Use GenericLexer to consume escaped code points

cubiclove · Aug 19, 2021 · dd44a5e
1 parent fd8cced
commit dd44a5e
Showing 1 changed file with 17 additions and 52 deletions.
diff --git a/Userland/Libraries/LibJS/Token.cpp b/Userland/Libraries/LibJS/Token.cpp
@@ -10,7 +10,6 @@
 #include <AK/CharacterTypes.h>
 #include <AK/GenericLexer.h>
 #include <AK/StringBuilder.h>
-#include <AK/Utf16View.h>
 
 namespace JS {
 
@@ -103,16 +102,6 @@ String Token::string_value(StringValueStatus& status) const
  return {};
  };
 
- auto decode_surrogate = [&lexer]() -> Optional<u16> {
- u16 surrogate = 0;
- for (int j = 0; j < 4; ++j) {
- if (!lexer.next_is(is_ascii_hex_digit))
- return {};
- surrogate = (surrogate << 4u) | hex2int(lexer.consume());
- }
- return surrogate;
- };
-
  StringBuilder builder;
  while (!lexer.is_eof()) {
  // No escape, consume one char and continue
@@ -121,6 +110,23 @@ String Token::string_value(StringValueStatus& status) const
  continue;
  }
 
+ // Unicode escape
+ if (lexer.next_is("\\u"sv)) {
+ auto code_point_or_error = lexer.consume_escaped_code_point();
+
+ if (code_point_or_error.is_error()) {
+ switch (code_point_or_error.error()) {
+ case GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape:
+ return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
+ case GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow:
+ return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
+ }
+ }
+
+ builder.append_code_point(code_point_or_error.value());
+ continue;
+ }
+
  lexer.ignore();
  VERIFY(!lexer.is_eof());
 
@@ -150,47 +156,6 @@ String Token::string_value(StringValueStatus& status) const
  builder.append_code_point(code_point);
  continue;
  }
- // Unicode escape
- if (lexer.next_is('u')) {
- lexer.ignore();
- u32 code_point = 0;
- if (lexer.next_is('{')) {
- lexer.ignore();
- while (true) {
- if (!lexer.next_is(is_ascii_hex_digit))
- return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
- auto new_code_point = (code_point << 4u) | hex2int(lexer.consume());
- if (new_code_point < code_point)
- return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
- code_point = new_code_point;
- if (lexer.next_is('}'))
- break;
- }
- lexer.ignore();
- } else {
- auto high_surrogate = decode_surrogate();
- if (!high_surrogate.has_value())
- return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
-
- if (Utf16View::is_high_surrogate(*high_surrogate) && lexer.consume_specific("\\u"sv)) {
- auto low_surrogate = decode_surrogate();
- if (!low_surrogate.has_value())
- return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
-
- if (Utf16View::is_low_surrogate(*low_surrogate)) {
- code_point = Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
- } else {
- builder.append_code_point(*high_surrogate);
- code_point = *low_surrogate;
- }
-
- } else {
- code_point = *high_surrogate;
- }
- }
- builder.append_code_point(code_point);
- continue;
- }
 
  // In non-strict mode LegacyOctalEscapeSequence is allowed in strings:
  // https://tc39.es/ecma262/#sec-additional-syntax-string-literals