forked from SerenityOS/serenity
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
AK: Add a Utf8View type for iterating over UTF-8 codepoints
Utf8View wraps a StringView and implements begin() and end() that return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode codepoints and returns them as 32-bit integers. This is the first step towards supporting emojis in Serenity ^) SerenityOS#490
- Loading branch information
1 parent
970e014
commit 5d36961
Showing
4 changed files
with
241 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#include <AK/TestSuite.h> | ||
|
||
#include <AK/Utf8View.h> | ||
|
||
// A pure-ASCII string must validate and decode to exactly one codepoint per byte.
TEST_CASE(decode_ascii)
{
    Utf8View utf8 { "Hello World!11" };
    EXPECT(utf8.validate());

    // Codepoints expected from the string above, in iteration order.
    u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 };
    size_t expected_size = sizeof(expected) / sizeof(*expected);

    size_t i = 0;
    for (u32 codepoint : utf8) {
        // Guard the table lookup below in case the iterator yields too many codepoints.
        ASSERT(i < expected_size);
        EXPECT_EQ(codepoint, expected[i]);
        ++i;
    }

    // The iteration must not have stopped early either.
    EXPECT_EQ(i, expected_size);
}
|
||
// A string mixing 1-, 2-, 3- and 4-byte sequences must validate and decode
// to the listed codepoints, in order.
TEST_CASE(decode_utf8)
{
    Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界" };
    EXPECT(utf8.validate());

    // Codepoints expected from the string above, in iteration order.
    u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
    size_t expected_size = sizeof(expected) / sizeof(*expected);

    size_t i = 0;
    for (u32 codepoint : utf8) {
        // Guard the table lookup below in case the iterator yields too many codepoints.
        ASSERT(i < expected_size);
        EXPECT_EQ(codepoint, expected[i]);
        ++i;
    }

    // The iteration must not have stopped early either.
    EXPECT_EQ(i, expected_size);
}
|
||
// validate() must reject each of these malformed byte sequences.
TEST_CASE(validate_invalid_ut8)
{
    // 182 (0b10110110) is a continuation byte with no lead byte in front of it.
    char invalid_utf8_1[] = { 42, 35, static_cast<char>(182), 9, 0 };
    Utf8View utf8_1 { invalid_utf8_1 };
    EXPECT(!utf8_1.validate());

    // 208 (0b11010000) starts a 2-byte sequence but is followed by another lead byte.
    char invalid_utf8_2[] = { 42, 35, static_cast<char>(208), static_cast<char>(208), 0 };
    Utf8View utf8_2 { invalid_utf8_2 };
    EXPECT(!utf8_2.validate());

    // A 2-byte lead byte with no continuation byte after it.
    char invalid_utf8_3[] = { static_cast<char>(208), 0 };
    Utf8View utf8_3 { invalid_utf8_3 };
    EXPECT(!utf8_3.validate());

    // A 2-byte lead byte followed by an ASCII byte (35, '#') instead of a continuation byte.
    char invalid_utf8_4[] = { static_cast<char>(208), 35, 0 };
    Utf8View utf8_4 { invalid_utf8_4 };
    EXPECT(!utf8_4.validate());
}
|
||
// NOTE(review): TEST_MAIN is defined in AK/TestSuite.h (not visible here); presumably it emits the entry point that runs the TEST_CASEs above under the suite name UTF8 - confirm. | ||
TEST_MAIN(UTF8) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
#include <AK/Utf8View.h> | ||
|
||
namespace AK { | ||
|
||
Utf8View::Utf8View(const StringView& string) | ||
: m_string(string) | ||
{ | ||
} | ||
|
||
const unsigned char* Utf8View::begin_ptr() const | ||
{ | ||
return (const unsigned char*)m_string.characters_without_null_termination(); | ||
} | ||
|
||
const unsigned char* Utf8View::end_ptr() const | ||
{ | ||
return (const unsigned char*)m_string.characters_without_null_termination() + m_string.length(); | ||
} | ||
|
||
// Returns an iterator positioned at the first byte, with the whole string
// length still remaining.
Utf8CodepointIterator Utf8View::begin() const
{
    return Utf8CodepointIterator(begin_ptr(), m_string.length());
}
|
||
// Returns the past-the-end iterator: positioned one past the last byte, with
// zero bytes remaining.
Utf8CodepointIterator Utf8View::end() const
{
    return Utf8CodepointIterator(end_ptr(), 0);
}
|
||
static inline bool decode_first_byte( | ||
unsigned char byte, | ||
int& out_codepoint_length_in_bytes, | ||
u32& out_value) | ||
{ | ||
if ((byte & 128) == 0) { | ||
out_value = byte; | ||
out_codepoint_length_in_bytes = 1; | ||
return true; | ||
} | ||
if ((byte & 64) == 0) { | ||
return false; | ||
} | ||
if ((byte & 32) == 0) { | ||
out_value = byte & 31; | ||
out_codepoint_length_in_bytes = 2; | ||
return true; | ||
} | ||
if ((byte & 16) == 0) { | ||
out_value = byte & 15; | ||
out_codepoint_length_in_bytes = 3; | ||
return true; | ||
} | ||
if ((byte & 8) == 0) { | ||
out_value = byte & 7; | ||
out_codepoint_length_in_bytes = 4; | ||
return true; | ||
} | ||
|
||
return false; | ||
} | ||
|
||
bool Utf8View::validate() const | ||
{ | ||
for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) { | ||
int codepoint_length_in_bytes; | ||
u32 value; | ||
bool first_byte_makes_sense = decode_first_byte(*ptr, codepoint_length_in_bytes, value); | ||
if (!first_byte_makes_sense) | ||
return false; | ||
|
||
for (int i = 1; i < codepoint_length_in_bytes; i++) { | ||
ptr++; | ||
if (ptr >= end_ptr()) | ||
return false; | ||
if (*ptr >> 6 != 2) | ||
return false; | ||
} | ||
} | ||
|
||
return true; | ||
} | ||
|
||
Utf8CodepointIterator::Utf8CodepointIterator(const unsigned char* ptr, int length) | ||
: m_ptr(ptr) | ||
, m_length(length) | ||
{ | ||
} | ||
|
||
bool Utf8CodepointIterator::operator==(const Utf8CodepointIterator& other) const | ||
{ | ||
return m_ptr == other.m_ptr && m_length == other.m_length; | ||
} | ||
|
||
bool Utf8CodepointIterator::operator!=(const Utf8CodepointIterator& other) const | ||
{ | ||
return !(*this == other); | ||
} | ||
|
||
// Advances to the first byte of the next codepoint.
//
// Always steps over at least one byte, then keeps stepping while the current
// byte is a continuation byte (10xxxxxx). Asserts that there is a byte left
// to step over.
Utf8CodepointIterator& Utf8CodepointIterator::operator++()
{
    do {
        ASSERT(m_length > 0);
        m_length--;
        m_ptr++;
        // Check the remaining length before looking at the byte: once
        // m_length reaches 0, m_ptr is one past the end of the string, which
        // is not null-terminated, so m_ptr[0] must not be read.
    } while (m_length > 0 && m_ptr[0] >> 6 == 2);

    return *this;
}
|
||
// Decodes and returns the codepoint that starts at the current position.
//
// Asserts that the iterator is not at the end, that the current byte is a
// valid first byte, that the whole sequence fits in the remaining length, and
// that every following byte of the sequence is a continuation byte.
u32 Utf8CodepointIterator::operator*() const
{
    ASSERT(m_length > 0);

    int codepoint_length_in_bytes;
    u32 decoded;

    bool first_byte_makes_sense = decode_first_byte(*m_ptr, codepoint_length_in_bytes, decoded);
    ASSERT(first_byte_makes_sense);
    ASSERT(codepoint_length_in_bytes <= m_length);

    // Each continuation byte contributes its low six bits.
    int offset = 1;
    while (offset < codepoint_length_in_bytes) {
        ASSERT(m_ptr[offset] >> 6 == 2);
        decoded = (decoded << 6) | (m_ptr[offset] & 63);
        ++offset;
    }

    return decoded;
}
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#pragma once | ||
|
||
#include <AK/StringView.h> | ||
#include <AK/Types.h> | ||
|
||
namespace AK { | ||
|
||
class Utf8View; | ||
|
||
// Forward iterator over the codepoints of a UTF-8 byte sequence. | ||
// Instances are created by Utf8View::begin() and Utf8View::end(); the | ||
// constructor is private and Utf8View is a friend. | ||
class Utf8CodepointIterator { | ||
friend class Utf8View; | ||
|
||
public: | ||
~Utf8CodepointIterator() {} | ||
|
||
// Iterators compare equal when both the position and the remaining length match. | ||
bool operator==(const Utf8CodepointIterator&) const; | ||
bool operator!=(const Utf8CodepointIterator&) const; | ||
// Advances past the current byte and any continuation bytes that follow it. | ||
Utf8CodepointIterator& operator++(); | ||
// Decodes the codepoint at the current position; asserts if the bytes there | ||
// do not form a complete sequence within the remaining length. | ||
u32 operator*() const; | ||
|
||
private: | ||
// Arguments: pointer to the current byte, number of bytes remaining from it. | ||
Utf8CodepointIterator(const unsigned char*, int); | ||
// Current position in the byte sequence. | ||
const unsigned char* m_ptr { nullptr }; | ||
// Number of bytes remaining from m_ptr; 0 for the end iterator. | ||
int m_length { -1 }; | ||
}; | ||
|
||
// Wraps a StringView and exposes its bytes as a range of UTF-8 codepoints | ||
// via begin()/end(), so it can be used in a range-based for loop. | ||
// NOTE(review): only the StringView is stored; presumably the underlying | ||
// characters must outlive this object - confirm against StringView. | ||
class Utf8View { | ||
public: | ||
explicit Utf8View(const StringView&); | ||
~Utf8View() {} | ||
|
||
// Returns the wrapped string. | ||
const StringView& as_string() const { return m_string; } | ||
|
||
// Iterators over the codepoints. Dereferencing asserts on malformed input, | ||
// so call validate() first when the input is not known to be valid. | ||
Utf8CodepointIterator begin() const; | ||
Utf8CodepointIterator end() const; | ||
|
||
// Returns true if the wrapped string passes the UTF-8 checks in Utf8View.cpp. | ||
bool validate() const; | ||
|
||
private: | ||
// Pointers to the first byte and one past the last byte of m_string. | ||
const unsigned char* begin_ptr() const; | ||
const unsigned char* end_ptr() const; | ||
|
||
StringView m_string; | ||
}; | ||
|
||
} | ||
|
||
using AK::Utf8View; |