Skip to content

Commit

Permalink
AK: Stop using U+0000 as end of file code point in URL parser
Browse files Browse the repository at this point in the history
This changes URL parser to use the 0xFFFFFFFF constant instead of 0 to
indicate end of file. This fixes a bug where inputs containing null
bytes would terminate the parser early, because they were interpreted
as end of file.
  • Loading branch information
MaxWipfli authored and awesomekling committed Jun 5, 2021
1 parent 97425c7 commit 2164d8a
Showing 1 changed file with 18 additions and 16 deletions.
34 changes: 18 additions & 16 deletions AK/URLParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@

namespace AK {

// NOTE: This is similar to the LibC macro EOF = -1.
constexpr u32 end_of_file = 0xFFFFFFFF;

constexpr bool is_url_code_point(u32 code_point)
{
// FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
Expand Down Expand Up @@ -221,14 +224,13 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
// NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
// ++iterator : "increase pointer by 1"
// continue : "decrease pointer by 1"
// NOTE: The NULL code point is used as the "EOF code point".
for (;;) {
u32 code_point = 0;
u32 code_point = end_of_file;
if (!iterator.done())
code_point = *iterator;

if constexpr (URL_PARSER_DEBUG) {
if (!code_point)
if (code_point == end_of_file)
dbgln("URLParser::parse: {} state with EOF.", state_name(state));
else if (is_ascii_printable(code_point))
dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
Expand Down Expand Up @@ -335,7 +337,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
} else if (code_point == '#') {
url.m_fragment = "";
state = State::Fragment;
} else if (code_point != 0) {
} else if (code_point != end_of_file) {
url.m_query = {};
if (url.m_paths.size())
url.m_paths.remove(url.m_paths.size() - 1);
Expand Down Expand Up @@ -408,7 +410,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
}
}
buffer.clear();
} else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
if (at_sign_seen && buffer.is_empty()) {
report_validation_error();
return {};
Expand All @@ -434,7 +436,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
url.m_host = host.release_value();
buffer.clear();
state = State::Port;
} else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
if (url.is_special() && buffer.is_empty()) {
report_validation_error();
return {};
Expand All @@ -457,7 +459,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
case State::Port:
if (is_ascii_digit(code_point)) {
buffer.append_code_point(code_point);
} else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
if (!buffer.is_empty()) {
auto port = buffer.to_string().to_uint();
if (!port.has_value() || port.value() > 65535) {
Expand Down Expand Up @@ -494,7 +496,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
} else if (code_point == '#') {
url.m_fragment = "";
state = State::Fragment;
} else if (code_point != 0) {
} else if (code_point != end_of_file) {
url.m_query = {};
auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
if (!starts_with_windows_drive_letter(substring_from_pointer)) {
Expand Down Expand Up @@ -524,7 +526,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
}
break;
case State::FileHost:
if (code_point == 0 || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
if (is_windows_drive_letter(buffer.to_string())) {
report_validation_error();
state = State::Path;
Expand Down Expand Up @@ -559,14 +561,14 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
} else if (code_point == '#') {
url.m_fragment = "";
state = State::Fragment;
} else if (code_point != 0) {
} else if (code_point != end_of_file) {
state = State::Path;
if (code_point != '/')
continue;
}
break;
case State::Path:
if (code_point == 0 || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
if (code_point == end_of_file || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
if (url.is_special() && code_point == '\\')
report_validation_error();
if (is_double_dot_path_segment(buffer.to_string())) {
Expand Down Expand Up @@ -616,10 +618,10 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
url.m_fragment = "";
state = State::Fragment;
} else {
if (code_point != 0 && !is_url_code_point(code_point) && code_point != '%')
if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if (code_point != 0) {
if (code_point != end_of_file) {
URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
} else {
// NOTE: This needs to be percent decoded since the member variables contain decoded data.
Expand All @@ -628,7 +630,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
}
break;
case State::Query:
if (code_point == '#' || code_point == 0) {
if (code_point == end_of_file || code_point == '#') {
VERIFY(url.m_query == "");
auto query_percent_encode_set = url.is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
// NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
Expand All @@ -638,7 +640,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
url.m_fragment = "";
state = State::Fragment;
}
} else if (code_point != 0) {
} else if (code_point != end_of_file) {
if (!is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
Expand All @@ -647,7 +649,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
break;
case State::Fragment:
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
if (code_point) {
if (code_point != end_of_file) {
if (!is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
Expand Down

0 comments on commit 2164d8a

Please sign in to comment.