ICU-22757 Remove allow list of known contractions with precomposed form from ICU4X mode of genuca

This assumes that future cases will work OK, since the addition that was seen in Unicode 16 alpha was OK.
unicode-org · May 13, 2024 · f5056cb · f5056cb
1 parent 564c92d
commit f5056cb
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 28 deletions.
diff --git a/icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu b/icu4c/source/data/in/coll/ucadata-implicithan-icu4x.icu
diff --git a/icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu b/icu4c/source/data/in/coll/ucadata-unihan-icu4x.icu
diff --git a/icu4c/source/data/unidata/generate.sh b/icu4c/source/data/unidata/generate.sh
@@ -23,10 +23,7 @@ rm $ICU_SRC/icu4c/source/common/propname_data.h
 rm $ICU_SRC/icu4c/source/common/*_props_data.h
 rm $ICU4C_DATA_IN/*.icu
 rm $ICU4C_DATA_IN/*.nrm
-# TODO: Back to deleting coll/*.icu once ICU4X data generation is fixed.
-# rm $ICU4C_DATA_IN/coll/*.icu
-rm $ICU4C_DATA_IN/coll/ucadata-implicithan.icu
-rm $ICU4C_DATA_IN/coll/ucadata-unihan.icu
+rm $ICU4C_DATA_IN/coll/*.icu
 # icu4c/source/i18n/collationfcd.cpp is generated by genuca;
 # probably hard to build genuca without depending on the old version.
 
@@ -49,6 +46,5 @@ bazelisk run //tools/unicode/c/genprops $ICU_SRC/icu4c
 bazelisk run //tools/unicode/c/genuca -- --hanOrder implicit $ICU_SRC/icu4c
 bazelisk run //tools/unicode/c/genuca -- --hanOrder radical-stroke $ICU_SRC/icu4c
 # Also generate the ICU4X versions
-# TODO: Currently fails with early Unicode 16.0 FractionalUCA.txt.
-# bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
-# bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c
+bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder implicit $ICU_SRC/icu4c
+bazelisk run //tools/unicode/c/genuca -- --icu4x --hanOrder radical-stroke $ICU_SRC/icu4c
diff --git a/icu4c/source/i18n/collationdatabuilder.cpp b/icu4c/source/i18n/collationdatabuilder.cpp
@@ -586,27 +586,19 @@ CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &
  if (s != sInNfd) {
  // s is not in NFD, so it cannot match in ICU4X, since ICU4X only
  // does NFD lookups.
- // Now check that we're only rejecting known cases.
- if (s.length() == 2) {
- char16_t second = s.charAt(1);
- if (second == 0x0F73 || second == 0x0F75 || second == 0x0F81) {
- // Second is a special decomposing Tibetan vowel sign.
- // These also get added in the decomposed form, so ignoring
- // this instance is OK.
- return;
- }
- if (c == 0xFDD1 && second == 0xAC00) {
- // This strange contraction exists in the root and
- // doesn't have a decomposed counterpart there.
- // This won't match in ICU4X anyway and is very strange:
- // Unassigned Arabic presentation form contracting with
- // the very first Hangul syllable. Let's ignore this
- // explicitly.
- return;
- }
- }
- // Unknown case worth investigating if ever found.
- errorCode = U_UNSUPPORTED_ERROR;
+
+ // As of Unicode 16 alpha, the cases that come here are:
+ //
+ // 1. The second character is a special decomposing Tibetan vowel
+ // sign. These are OK to ignore in the precomposed form, since
+ // the decomposed form is added also.
+ // 2. Likewise for KIRAT RAI VOWEL SIGN AA followed by KIRAT RAI VOWEL SIGN AI
+ // and other such cases.
+ // For details see the normalization section of
+ // https://www.unicode.org/review/pri497/pri497-background.html
+ // 3. U+FDD1 followed by U+AC00 is a marker for the alphabetical
+ // index feature of ICU4C, which at this time does not have
+ // a counterpart in ICU4X.
  return;
  }