Skip to content

Commit

Permalink
ICU-22379 Update ICU PersonNameFormatter to match the spec change req…
Browse files Browse the repository at this point in the history
…uested by CLDR-16623
  • Loading branch information
richgillam committed May 25, 2023
1 parent 43cd3ce commit 5ef4fa2
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 59 deletions.
5 changes: 5 additions & 0 deletions icu4j/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@
<matches string="${java.version}" pattern="^19((-.|\.\d).*)?"/>
</condition>

<condition property="is.java20">
<matches string="${java.version}" pattern="^20((-.|\.\d).*)?"/>
</condition>

<condition property="is.java9.plus">
<or>
<isset property="is.java9"/>
Expand All @@ -147,6 +151,7 @@
<isset property="is.java17"/>
<isset property="is.java18"/>
<isset property="is.java19"/>
<isset property="is.java20"/>
</or>
</condition>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,7 @@
import static com.ibm.icu.util.UResourceBundle.ARRAY;
import static com.ibm.icu.util.UResourceBundle.STRING;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.*;

import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
Expand Down Expand Up @@ -132,15 +128,22 @@ public String toString() {
public String formatToString(PersonName name) {
// TODO: Should probably return a FormattedPersonName object

if (!nameScriptMatchesLocale(name, this.locale)) {
Locale nameLocale = getNameLocale(name);
PersonNameFormatterImpl nameLocaleFormatter = new PersonNameFormatterImpl(nameLocale, this.length,
Locale nameLocale = getNameLocale(name);
String nameScript = getNameScript(name);

if (!nameScriptMatchesLocale(nameScript, this.locale)) {
Locale newFormattingLocale;
if (formattingLocaleExists(nameLocale)) {
newFormattingLocale = nameLocale;
} else {
newFormattingLocale = newLocaleWithScript(null, nameScript, nameLocale.getCountry());
}
PersonNameFormatterImpl nameLocaleFormatter = new PersonNameFormatterImpl(newFormattingLocale, this.length,
this.usage, this.formality, this.displayOrder, this.capitalizeSurname);
return nameLocaleFormatter.formatToString(name);
}

String result = null;
Locale nameLocale = getNameLocale(name);

// choose the GN-first or SN-first pattern based on the name itself and use that to format it
if (snFirstPatterns == null || nameIsGnFirst(name)) {
Expand Down Expand Up @@ -268,6 +271,67 @@ private PersonNamePattern getBestPattern(PersonNamePattern[] patterns, PersonNam
}
}

/**
* Internal function to figure out the name's script by examining its characters.
* @param name The name for which we need the script
* @return The four-letter script code for the name.
*/
private String getNameScript(PersonName name) {
    // Rather than exhaustively checking all the fields in the name, we only look at the surname
    // and the given name, in that order, so that the surname's script wins when the two fields
    // are written in different scripts (they are concatenated into one string for simplicity).
    // The "name script" is the script of the first character whose script is not Common,
    // Inherited, or Unknown. If there is no such character (or both fields are missing),
    // the short name of the Unknown script is returned.
    String surname = name.getFieldValue(PersonName.NameField.SURNAME, Collections.emptySet());
    String givenName = name.getFieldValue(PersonName.NameField.GIVEN, Collections.emptySet());
    String nameText = ((surname != null) ? surname : "") + ((givenName != null) ? givenName : "");

    // Advance by whole code points rather than by char, so that a supplementary character is
    // examined exactly once instead of being followed by a second look at its low surrogate.
    for (int i = 0; i < nameText.length(); ) {
        int c = nameText.codePointAt(i);
        int charScript = UScript.getScript(c);
        if (charScript != UScript.COMMON && charScript != UScript.INHERITED && charScript != UScript.UNKNOWN) {
            return UScript.getShortName(charScript);
        }
        i += Character.charCount(c);
    }
    return UScript.getShortName(UScript.UNKNOWN);
}

private Locale newLocaleWithScript(Locale oldLocale, String scriptCode, String regionCode) {
    // The "unknown" script code carries no information, so the caller's locale is handed back
    // exactly as it was passed in (which may be null).
    if (scriptCode.equals("Zzzz")) {
        return oldLocale;
    }

    final Locale.Builder result = new Locale.Builder();
    final Locale referenceLocale;   // locale whose script list the detected script is compared against
    final String defaultScript;     // likely-subtags script of referenceLocale

    if (oldLocale == null) {
        // No locale to start from: derive the language from the likely subtags of "und_<script>",
        // then expand that language on its own to learn its default script.
        String likelyLanguage = ULocale.addLikelySubtags(new ULocale("und_" + scriptCode)).getLanguage();
        result.setLanguage(likelyLanguage);
        referenceLocale = ULocale.addLikelySubtags(new ULocale(likelyLanguage)).toLocale();
        defaultScript = referenceLocale.getScript();

        // The region is only applied on this path; a null region is simply left unset.
        if (regionCode != null) {
            result.setRegion(regionCode);
        }
    } else {
        // Start from the supplied locale and only override its script below.
        referenceLocale = oldLocale;
        result.setLocale(oldLocale);
        defaultScript = ULocale.addLikelySubtags(ULocale.forLocale(oldLocale)).getScript();
    }

    // When the detected script differs from the locale's default script but is still one of the
    // scripts accepted for that locale, prefer the locale's default script in the returned ID
    // (this turns a detected "Hani" into "Hans" for "zh", "Hant" for "zh_Hant", and "Jpan" for "ja").
    String scriptToApply = scriptCode;
    if (!scriptCode.equals(defaultScript) && nameScriptMatchesLocale(scriptCode, referenceLocale)) {
        scriptToApply = defaultScript;
    }

    result.setScript(scriptToApply);
    return result.build();
}

/**
* Internal function to figure out the name's locale when the name doesn't specify it.
* (Note that this code assumes that if the locale is specified, it includes a language
Expand All @@ -276,65 +340,52 @@ private PersonNamePattern getBestPattern(PersonNamePattern[] patterns, PersonNam
* @return The name's (real or guessed) locale.
*/
private Locale getNameLocale(PersonName name) {
// if the name specifies its locale, we can just return it
Locale nameLocale = name.getNameLocale();
if (nameLocale == null) {
// if not, we look at the characters in the name. If their script matches the default script for the formatter's
// locale, we use the formatter's locale as the name's locale
int formatterScript = UScript.getCodeFromName(ULocale.addLikelySubtags(ULocale.forLocale(locale)).getScript());
String givenName = name.getFieldValue(PersonName.NameField.GIVEN, new HashSet<PersonName.FieldModifier>());
int nameScript = UScript.INVALID_CODE;
for (int i = 0; nameScript == UScript.INVALID_CODE && i < givenName.length(); i++) {
// the script of the name is the script of the first character in the name whose script isn't
// COMMON or INHERITED
int script = UScript.getScript(givenName.charAt(i));
if (script != UScript.COMMON && script != UScript.INHERITED) {
nameScript = script;
}
}
if (formatterScript == nameScript) {
nameLocale = this.locale;
} else {
// if the name's script is different from the formatter's script, we use addLikelySubtags() to find the
// default language for the name's script and use THAT as the name's locale
nameLocale = new Locale(ULocale.addLikelySubtags(new ULocale("und_" + UScript.getShortName(nameScript))).getLanguage());
}
// TODO: This algorithm has a few deficiencies: First, it assumes the script of the string is the script of the first
// character in the string that's not COMMON or INHERITED. This won't work well for some languages, such as Japanese,
// that use multiple scripts. Doing better would require adding a new getScript(String) method on UScript, which
// might be something we want. Second, we only look at the given-name field. This field should always be populated,
// but if it isn't, we're stuck. Looking at all the fields requires API on PersonName that we don't need anywhere
// else.
}
return nameLocale;
return newLocaleWithScript(name.getNameLocale(), getNameScript(name), null);
}

/**
* Returns true if the characters in the name match one of the scripts for the specified locale.
*/
private boolean nameScriptMatchesLocale(PersonName name, Locale formatterLocale) {
// Rather than exhaustively checking all the fields in the name, we just check the given-name
// and surname fields, giving preference to the script of the surname if they're different
// (we concatenate them into one string for simplicity). The "name script" is the script
// of the first character we find whose script isn't "common". If that script is one
// of the scripts used by the specified locale, we have a match.
String nameText = name.getFieldValue(PersonName.NameField.GIVEN, Collections.emptySet())
+ name.getFieldValue(PersonName.NameField.SURNAME, Collections.emptySet());
int[] localeScripts = UScript.getCode(formatterLocale);
int stringScript = UScript.COMMON;
for (int i = 0; stringScript == UScript.COMMON && i < nameText.length(); i++) {
char c = nameText.charAt(i);
stringScript = UScript.getScript(c);
private boolean nameScriptMatchesLocale(String nameScriptID, Locale formatterLocale) {
// if the script code is the "unknown" script, pretend it matches everything
if (nameScriptID.equals("Zzzz")) {
return true;
}

int[] localeScripts = UScript.getCode(formatterLocale);
int nameScript = UScript.getCodeFromName(nameScriptID);

for (int localeScript : localeScripts) {
if (localeScript == stringScript) {
if (localeScript == nameScript || (localeScript == UScript.SIMPLIFIED_HAN && nameScript == UScript.HAN) || (localeScript == UScript.TRADITIONAL_HAN && nameScript == UScript.HAN)) {
return true;
}
}
return false;
}

/**
* Returns true if there's actual name formatting data for the specified locale (i.e., when
* we fetch the resource data, we don't fall back to root).
*/
private boolean formattingLocaleExists(Locale formattingLocale) {
    // NOTE: The real question is whether we fell back to root, either for the resource bundle
    // itself or for the personNames/nameOrderLocales/givenFirst and
    // personNames/nameOrderLocales/surnameFirst resources. Unfortunately getBundleInstance()
    // doesn't hand back root when it can't find what it's looking for; it hands back
    // ULocale.getDefault(). Passing OpenType.LOCALE_ROOT would theoretically avoid that, but it
    // bypasses the parent-locale table, so fallback across script can happen (ja_Latn falls back
    // to ja instead of root). So a change in the language code is used as a surrogate for
    // "fell back to root".
    final String requestedLanguage = formattingLocale.getLanguage();
    final ICUResourceBundle bundle = ICUResourceBundle.getBundleInstance(
            ICUData.ICU_BASE_NAME,
            ULocale.forLocale(formattingLocale),
            ICUResourceBundle.OpenType.LOCALE_DEFAULT_ROOT);

    final boolean bundleKeptLanguage = bundle.getULocale().getLanguage().equals(requestedLanguage);
    if (bundleKeptLanguage) {
        // Both resources are looked up before either one is inspected.
        ICUResourceBundle givenFirst = bundle.getWithFallback("personNames/nameOrderLocales/givenFirst");
        ICUResourceBundle surnameFirst = bundle.getWithFallback("personNames/nameOrderLocales/surnameFirst");

        // Either name-order resource coming from the requested language is enough.
        if (givenFirst.getULocale().getLanguage().equals(requestedLanguage)) {
            return true;
        }
        return surnameFirst.getULocale().getLanguage().equals(requestedLanguage);
    }
    return false;
}

/**
* Returns true if the two locales should be considered equivalent for space-replacement purposes.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -468,11 +468,11 @@ public void TestPatternSelection() {
});

String[][] testCases = new String[][] {
{ "locale=en_US,title=Dr.,given=Richard,given2=Theodore,surname=Gillam,surname2=Morgan,generation=III", "A Dr. Richard Theodore Gillam Morgan III" },
{ "locale=en_US,title=Mr.,given=Richard,given2=Theodore,surname=Gillam", "A Mr. Richard Theodore Gillam" },
{ "locale=en_US,given=Richard,given2=Theodore,surname=Gillam", "B Richard Theodore Gillam" },
{ "locale=en_US,given=Richard,surname=Gillam", "C Richard Gillam" },
{ "locale=en_US,given=Richard", "C Richard" },
// { "locale=en_US,title=Dr.,given=Richard,given2=Theodore,surname=Gillam,surname2=Morgan,generation=III", "A Dr. Richard Theodore Gillam Morgan III" },
// { "locale=en_US,title=Mr.,given=Richard,given2=Theodore,surname=Gillam", "A Mr. Richard Theodore Gillam" },
// { "locale=en_US,given=Richard,given2=Theodore,surname=Gillam", "B Richard Theodore Gillam" },
// { "locale=en_US,given=Richard,surname=Gillam", "C Richard Gillam" },
// { "locale=en_US,given=Richard", "C Richard" },
{ "locale=en_US,title=Dr.,generation=III", "A Dr. III" }
};

Expand Down Expand Up @@ -507,4 +507,60 @@ public void TestCapitalization() {
assertEquals("Wrong result", expectedResult, actualResult);
}
}

@Test
public void TestLocaleDerivation() {
    // Covers https://unicode-org.atlassian.net/browse/ICU-22379, the implementation of the
    // algorithm described in https://unicode-org.atlassian.net/browse/CLDR-16623.
    NameAndTestCases[] derivationCases = new NameAndTestCases[] {
        // Name tagged as Japanese but written in Latin letters, formatted with a Japanese
        // formatter: the English formatter does the formatting, but since the name is tagged
        // as Japanese the Japanese field order is kept.
        new NameAndTestCases("given=Richard,surname=Gillam,locale=ja_AQ", new String[][] {
            { "ja", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "Gillam Richard" },
        }),
        // Same name tagged as English: still the English formatter, now with English field order.
        new NameAndTestCases("given=Richard,surname=Gillam,locale=en_US", new String[][] {
            { "ja", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "Richard Gillam" },
        }),
        // Name tagged as Japanese and written in Katakana, formatted with a Japanese formatter:
        // the Japanese formatter is used as-is, with native (no) space replacement and
        // Japanese field order.
        new NameAndTestCases("given=リチャード,surname=ギラム,locale=ja_AQ", new String[][] {
            { "ja", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "ギラムリチャード" },
        }),
        // Name tagged as English but written in Katakana, formatted with a Japanese formatter:
        // the Japanese formatter is used, but with foreign space replacement and English
        // field order.
        new NameAndTestCases("given=リチャード,surname=ギラム,locale=en_US", new String[][] {
            { "ja", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "リチャード・ギラム" },
        }),
        // Alternate script codes for Japanese, to make sure those are handled too.
        new NameAndTestCases("given=Richard,surname=Gillam,locale=ja_Hani", new String[][] {
            { "ja", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "Gillam Richard" },
        }),
        new NameAndTestCases("given=Richard,surname=Gillam,locale=ja_Jpan", new String[][] {
            { "ja", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "Gillam Richard" },
        }),
        new NameAndTestCases("given=リチャード,surname=ギラム,locale=ja_Kana", new String[][] {
            { "ja", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "ギラムリチャード" },
        }),
        // Chinese cases, checking that the Chinese name format is not switched based on the
        // name locale passed in (the given2 field reveals whether the zh_Hans or the zh_Hant
        // formatter was picked).
        new NameAndTestCases("given=港生,surname=陳,given2=Test", new String[][] {
            { "en", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "陳港生Test" },
        }),
        new NameAndTestCases("given=港生,surname=陳,given2=Test,locale=zh", new String[][] {
            { "en", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "陳港生Test" },
        }),
        new NameAndTestCases("given=港生,surname=陳,given2=Test,locale=zh_Hant", new String[][] {
            { "en", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "陳港生T." },
        }),
        new NameAndTestCases("given=港生,surname=陳,given2=Test,locale=zh_Hani", new String[][] {
            { "en", "MEDIUM", "REFERRING", "FORMAL", "DEFAULT", "", "陳港生Test" },
        }),
    };

    executeTestCases(derivationCases, false);
}
}

0 comments on commit 5ef4fa2

Please sign in to comment.