Skip to content

Commit

Permalink
ICU-22757 Remove allow list of known contractions with precomposed form from ICU4X mode of genuca
Browse files Browse the repository at this point in the history

This assumes that future cases will work OK, since the addition that was seen in Unicode 16 alpha
was OK.
  • Loading branch information
hsivonen authored and markusicu committed May 13, 2024
1 parent 564c92d commit f5056cb
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 28 deletions.
Binary file modified icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu
Binary file not shown.
Binary file modified icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu
Binary file not shown.
10 changes: 3 additions & 7 deletions icu4c/source/data/unidata/generate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,7 @@ rm $ICU_SRC/icu4c/source/common/propname_data.h
rm $ICU_SRC/icu4c/source/common/*_props_data.h
rm $ICU4C_DATA_IN/*.icu
rm $ICU4C_DATA_IN/*.nrm
# TODO: Back to deleting coll/*.icu once ICU4X data generation is fixed.
# rm $ICU4C_DATA_IN/coll/*.icu
rm $ICU4C_DATA_IN/coll/ucadata-implicithan.icu
rm $ICU4C_DATA_IN/coll/ucadata-unihan.icu
rm $ICU4C_DATA_IN/coll/*.icu
# icu4c/source/i18n/collationfcd.cpp is generated by genuca;
# probably hard to build genuca without depending on the old version.

Expand All @@ -49,6 +46,5 @@ bazelisk run //tools/unicode/c/genprops $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --hanOrder implicit $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --hanOrder radical-stroke $ICU_SRC/icu4c
# Also generate the ICU4X versions
# TODO: Currently fails with early Unicode 16.0 FractionalUCA.txt.
# bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
# bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c
34 changes: 13 additions & 21 deletions icu4c/source/i18n/collationdatabuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -586,27 +586,19 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
if (s != sInNfd) {
// s is not in NFD, so it cannot match in ICU4X, since ICU4X only
// does NFD lookups.
// Now check that we're only rejecting known cases.
if (s.length() == 2) {
char16_t second = s.charAt(1);
if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) {
// Second is a special decomposing Tibetan vowel sign.
// These also get added in the decomposed form, so ignoring
// this instance is OK.
return;
}
if (c == 0xFDD1 && second == 0xAC00) {
// This strange contraction exists in the root and
// doesn't have a decomposed counterpart there.
// This won't match in ICU4X anyway and is very strange:
// Unassigned Arabic presentation form contracting with
// the very first Hangul syllable. Let's ignore this
// explicitly.
return;
}
}
// Unknown case worth investigating if ever found.
errorCode = U_UNSUPPORTED_ERROR;

// As of Unicode 16 alpha, the cases that come here are:
//
// 1. The second character is a special decomposing Tibetan vowel
// sign. These are OK to ignore in the precomposed form, since
// the decomposed form is added also.
// 2. Likewise for KIRAT RAI VOWEL SIGN AA followed by KIRAT RAI VOWEL SIGN AI
// and other such cases.
// For details see the normalization section of
// https://www.unicode.org/review/pri497/pri497-background.html
// 3. U+FDD1 followed by U+AC00 is a marker for the alphabetical
// index feature of ICU4C, which at this time does not have
// a counterpart in ICU4X.
return;
}

Expand Down

0 comments on commit f5056cb

Please sign in to comment.