Skip to content

Commit

Permalink
LibJS: Use GenericLexer to consume escaped code points
Browse files Browse the repository at this point in the history
  • Loading branch information
trflynn89 authored and awesomekling committed Aug 19, 2021
1 parent fd8cced commit dd44a5e
Showing 1 changed file with 17 additions and 52 deletions.
69 changes: 17 additions & 52 deletions Userland/Libraries/LibJS/Token.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h>
#include <AK/StringBuilder.h>
#include <AK/Utf16View.h>

namespace JS {

Expand Down Expand Up @@ -103,16 +102,6 @@ String Token::string_value(StringValueStatus& status) const
return {};
};

// Reads four consecutive hex digits from `lexer` and packs them, most significant
// digit first, into a single 16-bit value.
// Returns an empty Optional as soon as the next character is not a hex digit; any
// digits consumed before that point are not handed back to the lexer.
auto decode_surrogate = [&lexer]() -> Optional<u16> {
    u16 code_unit = 0;
    int digits_remaining = 4;
    while (digits_remaining-- > 0) {
        // Bail out before consuming anything that is not a hex digit.
        if (!lexer.next_is(is_ascii_hex_digit))
            return {};
        // Shift the accumulated value up by one nibble and append the new digit.
        code_unit = (code_unit << 4u) | hex2int(lexer.consume());
    }
    return code_unit;
};

StringBuilder builder;
while (!lexer.is_eof()) {
// No escape, consume one char and continue
Expand All @@ -121,6 +110,23 @@ String Token::string_value(StringValueStatus& status) const
continue;
}

// Unicode escape
if (lexer.next_is("\\u"sv)) {
auto code_point_or_error = lexer.consume_escaped_code_point();

if (code_point_or_error.is_error()) {
switch (code_point_or_error.error()) {
case GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape:
return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
case GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow:
return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
}
}

builder.append_code_point(code_point_or_error.value());
continue;
}

lexer.ignore();
VERIFY(!lexer.is_eof());

Expand Down Expand Up @@ -150,47 +156,6 @@ String Token::string_value(StringValueStatus& status) const
builder.append_code_point(code_point);
continue;
}
// Unicode escape
if (lexer.next_is('u')) {
lexer.ignore();
u32 code_point = 0;
if (lexer.next_is('{')) {
lexer.ignore();
while (true) {
if (!lexer.next_is(is_ascii_hex_digit))
return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
auto new_code_point = (code_point << 4u) | hex2int(lexer.consume());
if (new_code_point < code_point)
return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
code_point = new_code_point;
if (lexer.next_is('}'))
break;
}
lexer.ignore();
} else {
auto high_surrogate = decode_surrogate();
if (!high_surrogate.has_value())
return encoding_failure(StringValueStatus::MalformedUnicodeEscape);

if (Utf16View::is_high_surrogate(*high_surrogate) && lexer.consume_specific("\\u"sv)) {
auto low_surrogate = decode_surrogate();
if (!low_surrogate.has_value())
return encoding_failure(StringValueStatus::MalformedUnicodeEscape);

if (Utf16View::is_low_surrogate(*low_surrogate)) {
code_point = Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
} else {
builder.append_code_point(*high_surrogate);
code_point = *low_surrogate;
}

} else {
code_point = *high_surrogate;
}
}
builder.append_code_point(code_point);
continue;
}

// In non-strict mode LegacyOctalEscapeSequence is allowed in strings:
// https://tc39.es/ecma262/#sec-additional-syntax-string-literals
Expand Down

0 comments on commit dd44a5e

Please sign in to comment.