LibJS: Allow Unicode escape sequences in identifiers
For example, "property.br\u{6f}wn" should resolve to "property.brown".

To support this behavior, this commit changes the Token class to hold
both the evaluated identifier name and a view into the original source
for the unevaluated name. There are some contexts in which identifiers
are not allowed to contain Unicode escape sequences; for example, export
statements of the form "export {} from foo.js" forbid escapes in the
identifier "from".

The test file is added to .prettierignore because prettier will replace
all escaped Unicode sequences with their unescaped value.
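
As a concrete illustration of the resolution described above (a minimal sketch assuming a LibJS build with this change applied; the object and values are made up):

    // "\u{6f}" encodes U+006F ("o"), so both spellings below name the same property.
    const obj = {};
    obj.brown = 12389;
    console.log(obj.br\u{6f}wn === obj.brown); // true

    // An escape may also supply the first code point of a binding's name.
    const \u{62}lue = 1; // declares a binding named "blue"
    console.log(blue);   // 1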
trflynn89 authored and awesomekling committed Aug 19, 2021
1 parent c5b5c77 commit 1259dc3
Showing 7 changed files with 162 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .prettierignore
@@ -1,3 +1,3 @@
Base/home/anon/Source/js
Userland/Libraries/LibJS/Tests/eval-aliasing.js

Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
122 changes: 89 additions & 33 deletions Userland/Libraries/LibJS/Lexer.cpp
@@ -8,6 +8,7 @@
#include "Lexer.h"
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/GenericLexer.h>
#include <AK/HashMap.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
@@ -350,6 +351,8 @@ u32 Lexer::current_code_point() const
if (m_position == 0)
return REPLACEMENT_CHARACTER;
Utf8View utf_8_view { m_source.substring_view(m_position - 1) };
if (utf_8_view.is_empty())
return REPLACEMENT_CHARACTER;
return *utf_8_view.begin();
}

@@ -369,30 +372,60 @@ bool Lexer::is_whitespace() const
return false;
}

bool Lexer::is_identifier_start() const
Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
{
if (!is_unicode_character())
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
auto code_point = current_code_point();
GenericLexer lexer(source().substring_view(m_position - 1));

if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
identifier_length = lexer.tell();
return code_point_or_error.value();
}

return {};
}

Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;

if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
}

if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$')
return code_point;

static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
if (id_start_category.has_value())
return Unicode::code_point_has_property(code_point, *id_start_category);
return false;
if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category))
return code_point;

return {};
}

bool Lexer::is_identifier_middle() const
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
if (!is_unicode_character())
return is_identifier_start() || is_ascii_digit(m_current_char);
auto code_point = current_code_point();
if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER)
return true;
u32 code_point = current_code_point();
identifier_length = 1;

if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
}

if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER))
return code_point;

static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
if (id_continue_category.has_value())
return Unicode::code_point_has_property(code_point, *id_continue_category);
return false;
if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
return code_point;

return {};
}

bool Lexer::is_line_comment_start(bool line_has_token_yet) const
@@ -494,6 +527,9 @@ Token Lexer::next()
// bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
String token_message;

Optional<FlyString> identifier;
size_t identifier_length = 0;

if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
token_type = TokenType::RegexFlags;
while (!is_eof() && is_ascii_alpha(m_current_char))
@@ -537,19 +573,26 @@ Token Lexer::next()
else
token_type = TokenType::TemplateLiteralString;
}
} else if (is_identifier_start()) {
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
// identifier or keyword
StringBuilder builder;
do {
consume();
} while (is_identifier_middle());
builder.append_code_point(*code_point);
for (size_t i = 0; i < identifier_length; ++i)
consume();

code_point = is_identifier_middle(identifier_length);
} while (code_point.has_value());

StringView value = m_source.substring_view(value_start - 1, m_position - value_start);
auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; });
if (it == s_keywords.end()) {
identifier = builder.build();
if (!m_parsed_identifiers.contains_slow(*identifier))
m_parsed_identifiers.append(*identifier);

auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
if (it == s_keywords.end())
token_type = TokenType::Identifier;
} else {
else
token_type = it->value;
}
} else if (is_numeric_literal_start()) {
token_type = TokenType::NumericLiteral;
bool is_invalid_numeric_literal = false;
@@ -708,15 +751,28 @@
}
}

m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
if (identifier.has_value()) {
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
identifier.release_value(),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
} else {
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
}

if constexpr (LEXER_DEBUG) {
dbgln("------------------------------");
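
A rough JavaScript sketch of the identifier forms the reworked is_identifier_start()/is_identifier_middle() paths are meant to accept (the bindings and values here are illustrative, not taken from the commit):

    // The first code point may come from an escape; classification uses the
    // resolved code point ("d" here), so this declares a binding named "dog".
    let \u{64}og = "wolf";

    // The same applies to later code points, so this reads the binding above.
    console.log(d\u{6f}g); // "wolf"

    // Escapes can also encode astral code points such as U+1D4D1 ("𝓑").
    let \u{1d4d1}rown = 3;
    console.log(𝓑rown); // 3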
9 changes: 7 additions & 2 deletions Userland/Libraries/LibJS/Lexer.h
@@ -41,8 +41,9 @@ class Lexer {
bool is_eof() const;
bool is_line_terminator() const;
bool is_whitespace() const;
bool is_identifier_start() const;
bool is_identifier_middle() const;
Optional<u32> is_unicode_escape(size_t& identifier_length) const;
Optional<u32> is_identifier_start(size_t& identifier_length) const;
Optional<u32> is_identifier_middle(size_t& identifier_length) const;
bool is_line_comment_start(bool line_has_token_yet) const;
bool is_block_comment_start() const;
bool is_block_comment_end() const;
@@ -80,6 +81,10 @@ class Lexer {
static HashMap<String, TokenType> s_three_char_tokens;
static HashMap<String, TokenType> s_two_char_tokens;
static HashMap<char, TokenType> s_single_char_tokens;

// Resolved identifiers must be kept alive for the duration of the parsing stage, otherwise
// the only references to these strings are deleted by the Token destructor.
Vector<FlyString> m_parsed_identifiers;
};

}
17 changes: 10 additions & 7 deletions Userland/Libraries/LibJS/Parser.cpp
@@ -210,7 +210,6 @@ constexpr OperatorPrecedenceTable g_operator_precedence;

Parser::ParserState::ParserState(Lexer l, Program::Type program_type)
: lexer(move(l))
, current_token(TokenType::Invalid, {}, {}, {}, {}, 0, 0, 0)
{
if (program_type == Program::Type::Module)
lexer.disallow_html_comments();
@@ -680,7 +679,7 @@ NonnullRefPtr<ClassExpression> Parser::parse_class_expression(bool expect_class_

if (match_property_key()) {
StringView name;
if (!is_generator && m_state.current_token.value() == "static"sv) {
if (!is_generator && m_state.current_token.original_value() == "static"sv) {
if (match(TokenType::Identifier)) {
consume();
is_static = true;
@@ -2524,7 +2523,7 @@ NonnullRefPtr<Statement> Parser::parse_for_statement()
{
auto rule_start = push_start();
auto match_for_in_of = [&]() {
return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.value() == "of");
return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.original_value() == "of");
};

consume(TokenType::For);
@@ -3019,7 +3018,7 @@ NonnullRefPtr<ImportStatement> Parser::parse_import_statement(Program& program)
};

auto match_as = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv;
return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
};

bool continue_parsing = true;
@@ -3134,11 +3133,15 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
syntax_error("Cannot use export statement outside a module");

auto match_as = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv;
return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
};

auto match_from = [&] {
return match(TokenType::Identifier) && m_state.current_token.value() == "from"sv;
return match(TokenType::Identifier) && m_state.current_token.original_value() == "from"sv;
};

auto match_default = [&] {
return match(TokenType::Default) && m_state.current_token.original_value() == "default"sv;
};

consume(TokenType::Export);
@@ -3158,7 +3161,7 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)

RefPtr<ASTNode> expression = {};

if (match(TokenType::Default)) {
if (match_default()) {
auto default_position = position();
consume(TokenType::Default);

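
The original_value() comparisons above make the parser recognize contextual keywords only when they are spelled without escapes. A hedged sketch of the intended effect, following the restriction noted in the commit message (the exact diagnostics are an assumption):

    // Written normally, "of" is recognized as the contextual keyword of a for-of head.
    for (const x of [1, 2, 3]) {
        console.log(x);
    }

    // Escaped spellings should no longer match the contextual keyword, so forms
    // like these are expected to be rejected rather than parsed as keywords:
    //
    //     for (const x \u{6f}f [1, 2, 3]) { }        // escaped "of"
    //     export { x \u{61}s y } from "./mod.js";    // escaped "as"
    //     export {} fr\u{6f}m "./mod.js";            // escaped "from"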
19 changes: 19 additions & 0 deletions Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
@@ -0,0 +1,19 @@
test("basic escapes", () => {
var foo = {};
foo.brown = 12389;

expect(foo.brown).toBe(12389);
expect(foo.br\u006fwn).toBe(12389);
expect(foo.br\u{6f}wn).toBe(12389);
expect(foo.\u{62}\u{72}\u{6f}\u{77}\u{6e}).toBe(12389);
});

test("non-ascii escapes", () => {
var foo = {};
foo.𝓑𝓻𝓸𝔀𝓷 = 12389;

expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
});
8 changes: 4 additions & 4 deletions Userland/Libraries/LibJS/Token.cpp
@@ -56,7 +56,7 @@ double Token::double_value() const

StringBuilder builder;

for (auto ch : m_value) {
for (auto ch : value()) {
if (ch == '_')
continue;
builder.append(ch);
@@ -75,7 +75,7 @@ double Token::double_value() const
return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 2));
} else if (is_ascii_digit(value_string[1])) {
// also octal, but syntax error in strict mode
if (!m_value.contains('8') && !m_value.contains('9'))
if (!value().contains('8') && !value().contains('9'))
return static_cast<double>(strtoul(value_string.characters() + 1, nullptr, 8));
}
}
@@ -95,7 +95,7 @@ String Token::string_value(StringValueStatus& status) const
VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);

auto is_template = type() == TokenType::TemplateLiteralString;
GenericLexer lexer(is_template ? m_value : m_value.substring_view(1, m_value.length() - 2));
GenericLexer lexer(is_template ? value() : value().substring_view(1, value().length() - 2));

auto encoding_failure = [&status](StringValueStatus parse_status) -> String {
status = parse_status;
@@ -195,7 +195,7 @@ String Token::string_value(StringValueStatus& status) const
bool Token::bool_value() const
{
VERIFY(type() == TokenType::BoolLiteral);
return m_value == "true";
return value() == "true";
}

bool Token::is_identifier_name() const