Skip to content

Commit

Permalink
LibUnicode: Parse and generate the Unicode locale list patterns dataset
Browse files Browse the repository at this point in the history
This data informs consumers how to join lists of values. For example,
in en-US, the list ["a", "b", "c"] formatted to a string should become
"a, b, and c".
  • Loading branch information
trflynn89 authored and linusg committed Sep 6, 2021
1 parent 9cd986d commit 3f64a14
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 10 deletions.
162 changes: 152 additions & 10 deletions Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeLocale.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@
#include <LibCore/File.h>
#include <LibUnicode/Locale.h>

// One entry of a locale's "listPatterns" JSON object, held as owned strings while the
// generator runs. Filled in by parse_locale_list_patters().
struct ListPatterns {
// Pattern category derived from the JSON key: "conjunction", "disjunction" or "unit".
String type;
// Pattern width derived from the JSON key: "short", "narrow", or "long" when neither is present.
String style;
// Value of the entry's "start" member.
String start;
// Value of the entry's "middle" member.
String middle;
// Value of the entry's "end" member.
String end;
// Value of the entry's "2" member.
// NOTE(review): presumably the pattern for a list of exactly two items — confirm against CLDR.
String pair;
};

struct Locale {
String language;
Optional<String> territory;
Expand All @@ -29,6 +38,7 @@ struct Locale {
HashMap<String, String> territories;
HashMap<String, String> scripts;
HashMap<String, String> currencies;
Vector<ListPatterns> list_patterns;
};

struct CanonicalLanguageID {
Expand All @@ -50,6 +60,8 @@ struct UnicodeLocaleData {
Vector<String> scripts;
Vector<String> variants;
Vector<String> currencies;
Vector<String> list_pattern_types;
Vector<String> list_pattern_styles;
HashMap<String, String> language_aliases;
HashMap<String, String> territory_aliases;
HashMap<String, String> script_aliases;
Expand Down Expand Up @@ -293,6 +305,58 @@ static void parse_locale_scripts(String locale_path, UnicodeLocaleData& locale_d
});
}

// Parses "<misc_path>/listPatterns.json" and appends one ListPatterns entry to `locale` for every
// member of that file's "listPatterns" object. Every pattern type and style seen for the first
// time is also appended to `locale_data.list_pattern_types` / `locale_data.list_pattern_styles`.
//
// The file must exist, open, and parse as JSON; each of those steps is guarded by a VERIFY.
//
// NOTE: "patters" in the function name is a misspelling of "patterns". It is left untouched here
// because the call site in parse_all_locales() uses the same spelling.
static void parse_locale_list_patters(String misc_path, UnicodeLocaleData& locale_data, Locale& locale)
{
    LexicalPath json_path(move(misc_path));
    json_path = json_path.append("listPatterns.json"sv);
    VERIFY(Core::File::exists(json_path.string()));

    auto file_or_error = Core::File::open(json_path.string(), Core::OpenMode::ReadOnly);
    VERIFY(!file_or_error.is_error());

    auto document = JsonParser(file_or_error.value()->read_all()).parse();
    VERIFY(document.has_value());

    // Walk down main -> <locale> -> listPatterns, where <locale> is the name of the directory
    // that contains the JSON file.
    auto const& main_object = document->as_object().get("main"sv);
    auto const& locale_object = main_object.as_object().get(json_path.parent().basename());
    auto const& patterns_object = locale_object.as_object().get("listPatterns"sv);

    // Translates the type fragment embedded in a member key into the name stored for that type.
    // A key containing none of the known fragments is treated as a programming error.
    auto type_for_key = [](StringView key) {
        if (key.contains("type-standard"sv))
            return "conjunction"sv;
        if (key.contains("type-or"sv))
            return "disjunction"sv;
        if (key.contains("type-unit"sv))
            return "unit"sv;
        VERIFY_NOT_REACHED();
    };

    // Keys that mention neither "short" nor "narrow" are classified as the "long" style.
    auto style_for_key = [](StringView key) {
        if (key.contains("short"sv))
            return "short"sv;
        return key.contains("narrow"sv) ? "narrow"sv : "long"sv;
    };

    // Appends `value` to `seen` unless it is already present (linear search).
    auto remember = [](auto& seen, StringView value) {
        if (!seen.contains_slow(value))
            seen.append(value);
    };

    patterns_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
        auto const& pattern_object = value.as_object();
        auto read_pattern = [&](StringView member) { return pattern_object.get(member).as_string(); };

        auto type = type_for_key(key);
        auto style = style_for_key(key);

        auto start = read_pattern("start"sv);
        auto middle = read_pattern("middle"sv);
        auto end = read_pattern("end"sv);
        auto pair = read_pattern("2"sv);

        remember(locale_data.list_pattern_types, type);
        remember(locale_data.list_pattern_styles, style);

        locale.list_patterns.append({ type, style, move(start), move(middle), move(end), move(pair) });
    });
}

static void parse_locale_currencies(String numbers_path, UnicodeLocaleData& locale_data, Locale& locale)
{
LexicalPath currencies_path(move(numbers_path));
Expand Down Expand Up @@ -333,9 +397,10 @@ static Core::DirIterator path_to_dir_iterator(String path)
return iterator;
}

static void parse_all_locales(String core_path, String locale_names_path, String numbers_path, UnicodeLocaleData& locale_data)
static void parse_all_locales(String core_path, String locale_names_path, String misc_path, String numbers_path, UnicodeLocaleData& locale_data)
{
auto locale_names_iterator = path_to_dir_iterator(move(locale_names_path));
auto misc_iterator = path_to_dir_iterator(move(misc_path));
auto numbers_iterator = path_to_dir_iterator(move(numbers_path));

LexicalPath core_supplemental_path(move(core_path));
Expand All @@ -356,6 +421,14 @@ static void parse_all_locales(String core_path, String locale_names_path, String
parse_locale_scripts(locale_path, locale_data, locale);
}

while (misc_iterator.has_next()) {
auto misc_path = misc_iterator.next_full_path();
VERIFY(Core::File::is_directory(misc_path));

auto& locale = locale_data.locales.ensure(LexicalPath::basename(misc_path));
parse_locale_list_patters(misc_path, locale_data, locale);
}

while (numbers_iterator.has_next()) {
auto numbers_path = numbers_iterator.next_full_path();
VERIFY(Core::File::is_directory(numbers_path));
Expand Down Expand Up @@ -423,6 +496,8 @@ namespace Unicode {
generate_enum("ScriptTag"sv, {}, locale_data.scripts);
generate_enum("Currency"sv, {}, locale_data.currencies);
generate_enum("Variant"sv, {}, locale_data.variants);
generate_enum("ListPatternType"sv, {}, locale_data.list_pattern_types);
generate_enum("ListPatternStyle"sv, {}, locale_data.list_pattern_styles);

generator.append(R"~~~(
namespace Detail {
Expand All @@ -444,6 +519,10 @@ Optional<StringView> resolve_script_tag_alias(StringView const& script_tag);
Optional<StringView> get_locale_currency_mapping(StringView locale, StringView currency);
Optional<Currency> currency_from_string(StringView const& currency);
Optional<ListPatterns> get_locale_list_pattern_mapping(StringView locale, StringView list_pattern_type, StringView list_pattern_style);
Optional<ListPatternType> list_pattern_type_from_string(StringView const& list_pattern_type);
Optional<ListPatternStyle> list_pattern_style_from_string(StringView const& list_pattern_style);
Optional<StringView> resolve_variant_alias(StringView const& variant);
Optional<StringView> resolve_subdivision_alias(StringView const& subdivision);
Expand Down Expand Up @@ -476,6 +555,15 @@ static void generate_unicode_locale_implementation(Core::File& file, UnicodeLoca
#include <LibUnicode/UnicodeLocale.h>
namespace Unicode {
struct Patterns {
ListPatternType type;
ListPatternStyle style;
StringView start;
StringView middle;
StringView end;
StringView pair;
};
)~~~");

auto format_mapping_name = [](StringView format, StringView name) {
Expand Down Expand Up @@ -507,7 +595,7 @@ namespace Unicode {
generator.append(String::formatted(" }}, {}", list.size()));
};

auto append_mapping_list = [&](String name, auto const& keys, auto const& mappings) {
auto append_string_list = [&](String name, auto const& keys, auto const& mappings) {
generator.set("name", name);
generator.set("size", String::number(keys.size()));

Expand Down Expand Up @@ -539,21 +627,46 @@ static constexpr Array<StringView, @size@> @name@ { {
)~~~");
};

auto append_mapping = [&](StringView name, StringView format, auto const& keys, auto get_mapping_callback) {
// Emits a `static constexpr Array<Patterns, N>` called `name` into the generated source, with one
// `{ type, style, start, middle, end, pair }` initializer row per element of `list_patterns`.
// The type and style become ListPatternType:: / ListPatternStyle:: enumerators; the four pattern
// strings become `"..."sv` literals.
// NOTE(review): the pattern text is inserted verbatim between the quotes — no escaping is applied
// here, so this relies on the patterns not containing `"` or `\`. Confirm against the input data.
auto append_list_patterns = [&](StringView name, Vector<ListPatterns> const& list_patterns) {
    // Wraps `text` in double quotes followed by the `sv` suffix.
    auto as_string_view_literal = [](auto const& text) {
        return String::formatted("\"{}\"sv", text);
    };

    generator.set("name", name);
    generator.set("size", String::number(list_patterns.size()));
    generator.append(R"~~~(
static constexpr Array<Patterns, @size@> @name@ { {)~~~");

    for (auto const& entry : list_patterns) {
        generator.set("type"sv, String::formatted("ListPatternType::{}", format_identifier({}, entry.type)));
        generator.set("style"sv, String::formatted("ListPatternStyle::{}", format_identifier({}, entry.style)));
        generator.set("start"sv, as_string_view_literal(entry.start));
        generator.set("middle"sv, as_string_view_literal(entry.middle));
        generator.set("end"sv, as_string_view_literal(entry.end));
        generator.set("pair"sv, as_string_view_literal(entry.pair));
        generator.append(R"~~~(
{ @type@, @style@, @start@, @middle@, @end@, @pair@ },)~~~");
    }

    generator.append(R"~~~(
} };
)~~~");
};

auto append_mapping = [&](StringView type, StringView name, StringView format, auto format_list_callback) {
Vector<String> mapping_names;

for (auto const& locale : locale_data.locales) {
auto mapping_name = format_mapping_name(format, locale.key);
append_mapping_list(mapping_name, keys, get_mapping_callback(locale.value));
format_list_callback(mapping_name, locale.value);
mapping_names.append(move(mapping_name));
}

quick_sort(mapping_names);

generator.set("type", type);
generator.set("name", name);
generator.set("size", String::number(locale_data.locales.size()));
generator.append(R"~~~(
static constexpr Array<Span<StringView const>, @size@> @name@ { {
static constexpr Array<Span<@type@ const>, @size@> @name@ { {
)~~~");

constexpr size_t max_values_per_row = 10;
Expand All @@ -577,10 +690,11 @@ static constexpr Array<Span<StringView const>, @size@> @name@ { {
)~~~");
};

append_mapping("s_languages"sv, "s_languages_{}", locale_data.languages, [](auto const& value) { return value.languages; });
append_mapping("s_territories"sv, "s_territories_{}", locale_data.territories, [](auto const& value) { return value.territories; });
append_mapping("s_scripts"sv, "s_scripts_{}", locale_data.scripts, [](auto const& value) { return value.scripts; });
append_mapping("s_currencies"sv, "s_currencies_{}", locale_data.currencies, [](auto const& value) { return value.currencies; });
append_mapping("StringView"sv, "s_languages"sv, "s_languages_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.languages, value.languages); });
append_mapping("StringView"sv, "s_territories"sv, "s_territories_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.territories, value.territories); });
append_mapping("StringView"sv, "s_scripts"sv, "s_scripts_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.scripts, value.scripts); });
append_mapping("StringView"sv, "s_currencies"sv, "s_currencies_{}", [&](auto const& name, auto const& value) { append_string_list(name, locale_data.currencies, value.currencies); });
append_mapping("Patterns"sv, "s_list_patterns"sv, "s_list_patterns_{}", [&](auto const& name, auto const& value) { append_list_patterns(name, value.list_patterns); });

generator.append(R"~~~(
struct CanonicalLanguageID {
Expand Down Expand Up @@ -866,7 +980,35 @@ Optional<StringView> resolve_@enum_snake@_alias(StringView const& @enum_snake@)
append_alias_search("variant"sv, locale_data.variant_aliases);
append_alias_search("subdivision"sv, locale_data.subdivision_aliases);

append_from_string("ListPatternType"sv, "list_pattern_type"sv, locale_data.list_pattern_types);
append_from_string("ListPatternStyle"sv, "list_pattern_style"sv, locale_data.list_pattern_styles);

generator.append(R"~~~(
Optional<ListPatterns> get_locale_list_pattern_mapping(StringView locale, StringView list_pattern_type, StringView list_pattern_style)
{
auto locale_value = locale_from_string(locale);
if (!locale_value.has_value())
return {};
auto type_value = list_pattern_type_from_string(list_pattern_type);
if (!type_value.has_value())
return {};
auto style_value = list_pattern_style_from_string(list_pattern_style);
if (!style_value.has_value())
return {};
auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
auto const& locale_list_patterns = s_list_patterns.at(locale_index);
for (auto const& list_patterns : locale_list_patterns) {
if ((list_patterns.type == type_value) && (list_patterns.style == style_value))
return ListPatterns { list_patterns.start, list_patterns.middle, list_patterns.end, list_patterns.pair };
}
return {};
}
void resolve_complex_language_aliases(Unicode::LanguageID& language_id)
{
for (auto const& map : s_complex_alias) {
Expand Down Expand Up @@ -969,7 +1111,7 @@ int main(int argc, char** argv)
auto generated_implementation_file = open_file(generated_implementation_path, "-c/--generated-implementation-path", Core::OpenMode::ReadWrite);

UnicodeLocaleData locale_data;
parse_all_locales(core_path, locale_names_path, numbers_path, locale_data);
parse_all_locales(core_path, locale_names_path, misc_path, numbers_path, locale_data);

generate_unicode_locale_header(generated_header_file, locale_data);
generate_unicode_locale_implementation(generated_implementation_file, locale_data);
Expand Down
3 changes: 3 additions & 0 deletions Userland/Libraries/LibUnicode/Forward.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ namespace Unicode {
enum class Condition : u8;
enum class GeneralCategory : u8;
enum class Language : u8;
enum class ListPatternStyle : u8;
enum class ListPatternType : u8;
enum class Locale : u16;
enum class Property : u8;
enum class Script : u8;
Expand All @@ -21,6 +23,7 @@ enum class WordBreakProperty : u8;

struct Keyword;
struct LanguageID;
struct ListPatterns;
struct LocaleExtension;
struct LocaleID;
struct OtherExtension;
Expand Down
9 changes: 9 additions & 0 deletions Userland/Libraries/LibUnicode/Locale.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,15 @@ Optional<StringView> get_locale_currency_mapping([[maybe_unused]] StringView loc
#endif
}

// Looks up the list patterns of `locale` for the given pattern `type` and `style`.
//
// When ENABLE_UNICODE_DATA is set, the lookup is forwarded unchanged to
// Detail::get_locale_list_pattern_mapping(). Otherwise an empty Optional is always returned.
// The parameters are marked [[maybe_unused]] because that second branch does not read them.
Optional<ListPatterns> get_locale_list_patterns([[maybe_unused]] StringView locale, [[maybe_unused]] StringView type, [[maybe_unused]] StringView style)
{
#if ENABLE_UNICODE_DATA
return Detail::get_locale_list_pattern_mapping(locale, type, style);
#else
return {};
#endif
}

Optional<StringView> resolve_language_alias(StringView language)
{
#if ENABLE_UNICODE_DATA
Expand Down
8 changes: 8 additions & 0 deletions Userland/Libraries/LibUnicode/Locale.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ struct LocaleID {
Vector<String> private_use_extensions {};
};

// The pattern strings of one list pattern, as returned by get_locale_list_patterns().
// All members are non-owning StringViews.
// NOTE(review): the member names mirror the CLDR listPatterns keys "start", "middle", "end" and
// "2" (stored as `pair`); their exact placeholder semantics are defined by CLDR — confirm there.
struct ListPatterns {
StringView start;
StringView middle;
StringView end;
// NOTE(review): presumably the pattern used when the list has exactly two items — confirm.
StringView pair;
};

// Note: These methods only verify that the provided strings match the EBNF grammar of the
// Unicode identifier subtag (i.e. no validation is done that the tags actually exist).
constexpr bool is_unicode_language_subtag(StringView subtag)
Expand Down Expand Up @@ -130,6 +137,7 @@ Optional<StringView> get_locale_language_mapping(StringView locale, StringView l
Optional<StringView> get_locale_territory_mapping(StringView locale, StringView territory);
Optional<StringView> get_locale_script_mapping(StringView locale, StringView script);
Optional<StringView> get_locale_currency_mapping(StringView locale, StringView currency);
Optional<ListPatterns> get_locale_list_patterns(StringView locale, StringView type, StringView style);

Optional<StringView> resolve_language_alias(StringView language);
Optional<StringView> resolve_territory_alias(StringView territory);
Expand Down

0 comments on commit 3f64a14

Please sign in to comment.