1 /* Copyright (C) 2007-2013 Google and others. All Rights Reserved. */ 2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */ 3 4 package org.unicode.cldr.test; 5 6 import com.google.common.base.Joiner; 7 import com.google.common.base.Splitter; 8 import com.google.common.collect.TreeMultimap; 9 import com.google.myanmartools.ZawgyiDetector; 10 import com.ibm.icu.lang.UCharacter; 11 import com.ibm.icu.text.Collator; 12 import com.ibm.icu.text.DateIntervalInfo; 13 import com.ibm.icu.text.DateTimePatternGenerator; 14 import com.ibm.icu.text.DecimalFormat; 15 import com.ibm.icu.text.Normalizer; 16 import com.ibm.icu.text.Transform; 17 import com.ibm.icu.text.Transliterator; 18 import com.ibm.icu.text.UnicodeSet; 19 import com.ibm.icu.text.UnicodeSetIterator; 20 import com.ibm.icu.util.Output; 21 import com.ibm.icu.util.ULocale; 22 import java.util.ArrayList; 23 import java.util.Arrays; 24 import java.util.Comparator; 25 import java.util.HashMap; 26 import java.util.HashSet; 27 import java.util.List; 28 import java.util.Locale; 29 import java.util.Map; 30 import java.util.Set; 31 import java.util.TreeSet; 32 import java.util.regex.Matcher; 33 import java.util.regex.Pattern; 34 import org.unicode.cldr.test.CheckExemplars.ExemplarType; 35 import org.unicode.cldr.util.AnnotationUtil; 36 import org.unicode.cldr.util.Builder; 37 import org.unicode.cldr.util.CLDRConfig; 38 import org.unicode.cldr.util.CLDRFile; 39 import org.unicode.cldr.util.CLDRLocale; 40 import org.unicode.cldr.util.CldrUtility; 41 import org.unicode.cldr.util.ComparatorUtilities; 42 import org.unicode.cldr.util.DateTimeCanonicalizer; 43 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType; 44 import org.unicode.cldr.util.Emoji; 45 import org.unicode.cldr.util.LocaleNames; 46 import org.unicode.cldr.util.PatternCache; 47 import org.unicode.cldr.util.SimpleUnicodeSetFormatter; 48 import org.unicode.cldr.util.SupplementalDataInfo; 49 import 
org.unicode.cldr.util.UnicodeSetPrettyPrinter; 50 import org.unicode.cldr.util.VoteResolver; 51 import org.unicode.cldr.util.XMLSource; 52 import org.unicode.cldr.util.XPathParts; 53 54 /** 55 * Class for processing the input and output of CLDR data for use in the Survey Tool and other 56 * tools. 57 */ 58 public class DisplayAndInputProcessor { 59 60 /** Special PersonName paths that allow empty string, public for testing */ 61 public static final String NOL_START_PATH = "//ldml/personNames/nameOrderLocales"; 62 63 public static final String FSR_START_PATH = "//ldml/personNames/foreignSpaceReplacement"; 64 public static final String NSR_START_PATH = "//ldml/personNames/nativeSpaceReplacement"; 65 66 public static final String EMPTY_ELEMENT_VALUE = "❮EMPTY❯"; 67 68 private static final boolean FIX_YEARS = true; 69 70 public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false); 71 72 public static final UnicodeSet RTL = 73 new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]").freeze(); 74 75 public static final Pattern NUMBER_SEPARATOR_PATTERN = 76 Pattern.compile("//ldml/numbers/symbols.*/(decimal|group)"); 77 78 private static final Pattern APOSTROPHE_SKIP_PATHS = 79 PatternCache.get( 80 "//ldml/(" 81 + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|" 82 + "characters/.*|" 83 + "delimiters/.*|" 84 + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|" 85 + "units/.+/unitPattern.*|" 86 + "units/.+/durationUnitPattern.*|" 87 + "numbers/symbols.*|" 88 + "numbers/miscPatterns.*|" 89 + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)"); 90 private static final Pattern INTERVAL_FORMAT_PATHS = 91 PatternCache.get("//ldml/dates/.+/intervalFormat(Item.*|Fallback)"); 92 private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])"); 93 94 // Pattern to match against paths that might have time formats with h or K (12-hour 
cycles) 95 private static final Pattern HOUR_FORMAT_XPATHS = 96 PatternCache.get( 97 "//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/(" 98 + "timeFormats/timeFormatLength\\[@type=\"[^\"]*\"]/timeFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|" 99 + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*|" 100 + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*)"); 101 102 private static final Pattern AMPM_SPACE_BEFORE = 103 PatternCache.get("([Khms])([ \\u00A0\\u202F]+)(a+)"); // time, space, a+ 104 private static final Pattern AMPM_SPACE_AFTER = 105 PatternCache.get("(a+)([ \\u00A0\\u202F]+)([Kh])"); // a+, space, hour 106 107 // Pattern to match against paths that might have date formats with y 108 private static final Pattern YEAR_FORMAT_XPATHS = 109 PatternCache.get( 110 "//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/(" 111 + "dateFormats/dateFormatLength\\[@type=\"[^\"]*\"]/dateFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|" 112 + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*|" 113 + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*)"); 114 115 // Cyrillic year markers are or begin with (in various languages) \u0430 \u0433 \u0435 \u0436 116 // \u043E \u0440 \u0441 117 private static final Pattern YEAR_SPACE_YEARMARKER = 118 PatternCache.get("y[ \\u00A0]+('?[агежорс])"); // y, space, Cyrillic year marker start 119 120 public static final Pattern UNIT_NARROW_XPATHS = 121 PatternCache.get( 122 "//ldml/units/unitLength\\[@type=\"narrow\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*"); 123 124 public static final Pattern UNIT_SHORT_XPATHS = 125 PatternCache.get( 126 "//ldml/units/unitLength\\[@type=\"short\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*"); 127 128 private static final Pattern PLACEHOLDER_SPACE_AFTER = 129 PatternCache.get("\\}[ \\u00A0\\u202F]+"); 130 private static 
final Pattern PLACEHOLDER_SPACE_BEFORE = 131 PatternCache.get("[ \\u00A0\\u202F]+\\{"); 132 private static final Pattern INTERVAL_FALLBACK_RANGE = PatternCache.get("\\} [\\u2013-] \\{"); 133 134 /** string of whitespace not including NBSP, i.e. [\t\n\r]+ */ 135 private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+"); // 136 137 /** string of whitespace, possibly including NBSP and/or NNBSP, ie., [\u00A0\t\n\r\u202F]+ */ 138 private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = 139 PatternCache.get("[\\s\\u00A0]+"); 140 // Reverted 2022-12-08 from: 141 // private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE = 142 // PatternCache.get("[\\s\\u00A0\\u202F]+"); 143 144 /** one or more NBSP (or NNBSP) followed by one or more regular spaces */ 145 private static final Pattern NBSP_PLUS_SPACE_TO_NORMALIZE = 146 PatternCache.get("\\u00A0+\\u0020+"); 147 // Reverted 2022-12-08 from: 148 // private static final Pattern NBSP_PLUS_SPACE_TO_NORMALIZE = 149 // PatternCache.get("[\\u00A0\\u202F]+\\u0020+"); 150 151 /** one or more regular spaces followed by one or more NBSP (or NNBSP) */ 152 private static final Pattern SPACE_PLUS_NBSP_TO_NORMALIZE = 153 PatternCache.get("\\u0020+\\u00A0+"); 154 // Reverted 2022-12-08 from: 155 // private static final Pattern SPACE_PLUS_NBSP_TO_NORMALIZE = 156 // PatternCache.get("\\u0020+[\\u00A0\\u202F]+"); 157 158 // NNBSP 202F among other horizontal spaces (includes 0020, 00A0, 2009, 202F, etc.) 
159 private static final Pattern NNBSP_AMONG_OTHER_SPACES = 160 PatternCache.get("[\\h&&[^\\u202F]]+\\u202F\\h*|\\u202F\\h+"); 161 // NBSP 00A0 among other horizontal spaces 162 private static final Pattern NBSP_AMONG_OTHER_SPACES = 163 PatternCache.get("[\\h&&[^\\u00A0]]+\\u00A0\\h*|\\u00A0\\h+"); 164 // THIN SPACE 2009 among other horizontal spaces 165 private static final Pattern THIN_SPACE_AMONG_OTHER_SPACES = 166 PatternCache.get("[\\h&&[^\\u2009]]+\\u2009\\h*|\\u2009\\h+"); 167 168 private static final Pattern INITIAL_NBSP = PatternCache.get("^[\\u00A0\\u202F]+"); 169 private static final Pattern FINAL_NBSP = PatternCache.get("[\\u00A0\\u202F]+$"); 170 171 private static final Pattern MULTIPLE_NBSP = PatternCache.get("\\u00A0\\u00A0+"); 172 // Reverted 2022-12-08 from: 173 // private static final Pattern MULTIPLE_NBSP = 174 // PatternCache.get("[\\u00A0\\u202F][\\u00A0\\u202F]+"); 175 176 // The following includes (among others) \u0009, \u0020, \u00A0, \u2007, \u2009, \u202F, \u3000 177 private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze(); 178 179 private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml"); 180 private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro"); 181 private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca"); 182 private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo"); 183 private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg"); 184 private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he"); 185 private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my"); 186 private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky"); 187 private static final CLDRLocale URDU = CLDRLocale.getInstance("ur"); 188 private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps"); 189 private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa"); 190 private static final CLDRLocale GERMAN_SWITZERLAND = 
CLDRLocale.getInstance("de_CH"); 191 private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw"); 192 private static final CLDRLocale FF_ADLAM = CLDRLocale.getInstance("ff_Adlm"); 193 private static final CLDRLocale KASHMIRI = CLDRLocale.getInstance("ks"); 194 public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE = 195 new HashSet<>( 196 Arrays.asList( 197 "br", "bss", "cad", "cic", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo", 198 "mic", "moh", "mus", "nnh", "qu", "quc", "uk", "uz", "uz_Latn")); 199 200 // Ş ş Ţ ţ => Ș ș Ț ț 201 private static final char[][] ROMANIAN_CONVERSIONS = { 202 {'\u015E', '\u0218'}, {'\u015F', '\u0219'}, {'\u0162', '\u021A'}, {'\u0163', '\u021B'} 203 }; 204 205 private static final char[][] CATALAN_CONVERSIONS = { 206 {'\u013F', '\u004C', '\u00B7'}, // Ŀ -> L· 207 {'\u0140', '\u006C', '\u00B7'} 208 }; // ŀ -> l· 209 210 private static final char[][] NGOMBA_CONVERSIONS = { 211 {'\u0251', '\u0061'}, {'\u0261', '\u0067'}, // ɑ -> a , ɡ -> g , See ticket #5691 212 {'\u2019', '\uA78C'}, {'\u02BC', '\uA78C'} 213 }; // Saltillo, see ticket #6805 214 215 private static final char[][] KWASIO_CONVERSIONS = { 216 {'\u0306', '\u030C'}, // See ticket #6571, use caron instead of breve 217 {'\u0103', '\u01CE'}, 218 {'\u0102', '\u01CD'}, // a-breve -> a-caron 219 {'\u0115', '\u011B'}, 220 {'\u011A', '\u01CD'}, // e-breve -> e-caron 221 {'\u012D', '\u01D0'}, 222 {'\u012C', '\u01CF'}, // i-breve -> i-caron 223 {'\u014F', '\u01D2'}, 224 {'\u014E', '\u01D1'}, // o-breve -> o-caron 225 {'\u016D', '\u01D4'}, 226 {'\u016C', '\u01D3'} // u-breve -> u-caron 227 }; 228 229 private static final char[][] HEBREW_CONVERSIONS = { 230 {'\'', '\u05F3'}, {'"', '\u05F4'} 231 }; // ' -> geresh " -> gershayim 232 233 private static final char[][] KYRGYZ_CONVERSIONS = {{'ӊ', 'ң'}, {'Ӊ', 'Ң'}}; // right modifier 234 235 private static final char[][] URDU_PLUS_CONVERSIONS = {{'\u0643', '\u06A9'}}; // wrong char 236 237 private static final char[][] 
KASHMIRI_CONVERSIONS = { 238 {'ۍ', 'ؠ'} 239 }; // wrong char (see CLDR-16595) 240 241 private static final ZawgyiDetector detector = new ZawgyiDetector(); 242 private static final Transliterator zawgyiUnicodeTransliterator = 243 Transliterator.getInstance("Zawgyi-my"); 244 245 private SimpleUnicodeSetFormatter pp = new SimpleUnicodeSetFormatter(); // default collator 246 private UnicodeSetPrettyPrinter rawFormatter = new UnicodeSetPrettyPrinter(); // default 247 248 private final CLDRLocale locale; 249 private String scriptCode; // actual or default script code (not null after init) 250 private boolean isPosix; 251 252 private CLDRFile cldrFileForBailey = null; 253 254 /** 255 * Constructor, taking cldrFile. 256 * 257 * @param cldrFileToCheck 258 */ DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator)259 public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) { 260 init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator); 261 } 262 DisplayAndInputProcessor(CLDRFile cldrFileToCheck)263 public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) { 264 init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true); 265 } 266 init(CLDRLocale locale, boolean needsCollator)267 void init(CLDRLocale locale, boolean needsCollator) { 268 isPosix = locale.toString().contains("POSIX"); 269 if (needsCollator) { 270 Collator col = 271 ComparatorUtilities.getCldrCollator(locale.toString(), Collator.IDENTICAL); 272 Collator spaceCol = 273 ComparatorUtilities.getCldrCollator(locale.toString(), Collator.PRIMARY); 274 pp = new SimpleUnicodeSetFormatter((Comparator) col); 275 rawFormatter = UnicodeSetPrettyPrinter.from((Comparator) col, (Comparator) spaceCol); 276 } else { 277 pp = new SimpleUnicodeSetFormatter(); // default collator 278 rawFormatter = new UnicodeSetPrettyPrinter(); // default 279 } 280 String script = locale.getScript(); 281 if (script == null || script.length() < 4) { 282 
SupplementalDataInfo sdi = CLDRConfig.getInstance().getSupplementalDataInfo(); 283 script = sdi.getDefaultScript(locale.getBaseName()); 284 if (script == null || script.length() < 4 || script.equals("Zzzz")) { 285 script = sdi.getDefaultScript(locale.getLanguage()); 286 } 287 if (script == null || script.length() < 4) { 288 script = "Zzzz"; 289 } 290 } 291 scriptCode = script; 292 } 293 getPrettyPrinter()294 public SimpleUnicodeSetFormatter getPrettyPrinter() { 295 return pp; 296 } 297 298 /** 299 * Constructor, taking ULocale and boolean. 300 * 301 * @param locale the ULocale 302 * @param needsCollator true or false 303 * <p>Called by getProcessor, with locale = SurveyMain.TRANS_HINT_LOCALE 304 */ DisplayAndInputProcessor(ULocale locale, boolean needsCollator)305 public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) { 306 init(this.locale = CLDRLocale.getInstance(locale), needsCollator); 307 } 308 309 /** 310 * Constructor, taking ULocale. 311 * 312 * @param locale the ULocale 313 */ DisplayAndInputProcessor(ULocale locale)314 public DisplayAndInputProcessor(ULocale locale) { 315 init(this.locale = CLDRLocale.getInstance(locale), true /* needsCollator */); 316 } 317 318 /** 319 * Constructor, taking CLDRLocale and boolean. 320 * 321 * @param locale the CLDRLocale 322 * @param needsCollator true or false 323 */ DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator)324 public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) { 325 init(this.locale = locale, needsCollator); 326 } 327 328 /** 329 * Constructor, taking locale. 330 * 331 * @param locale 332 */ DisplayAndInputProcessor(CLDRLocale locale)333 public DisplayAndInputProcessor(CLDRLocale locale) { 334 init(this.locale = locale, true); 335 } 336 337 /** 338 * Process the value for display. The result is a string for display in the Survey tool or 339 * similar program. 
340 * 341 * @param path 342 * @param value 343 * @return 344 */ processForDisplay(String path, String value)345 public synchronized String processForDisplay(String path, String value) { 346 if (value == null) { 347 return null; 348 } 349 if (CldrUtility.INHERITANCE_MARKER.equals(value)) { 350 return value; 351 } 352 value = Normalizer.compose(value, false); // Always normalize all text to NFC. 353 if (hasUnicodeSetValue(path)) { 354 return displayUnicodeSet(value); 355 } else if (path.contains("stopword")) { 356 return value.trim().isEmpty() ? "NONE" : value; 357 } else { 358 NumericType numericType = NumericType.getNumericType(path); 359 if (numericType != NumericType.NOT_NUMERIC) { 360 // Canonicalize existing values that aren't canonicalized yet. 361 // New values will be canonicalized on input using processInput(). 362 try { 363 value = getCanonicalPattern(value, numericType, isPosix); 364 } catch (IllegalArgumentException e) { 365 if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value); 366 } 367 if (numericType != NumericType.CURRENCY 368 && numericType != NumericType.CURRENCY_ABBREVIATED) { 369 value = value.replace("'", ""); 370 } 371 } 372 } 373 // Fix up any apostrophes in number symbols 374 if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) { 375 value = value.replace('\'', '\u2019'); 376 } 377 // Fix up any apostrophes as appropriate (Don't do so for things like date patterns... 
378 if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 379 value = normalizeApostrophes(value); 380 } 381 // Fix up hyphens, replacing with N-dash as appropriate 382 if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) { 383 value = 384 normalizeIntervalHyphensAndSpaces( 385 value); // This may also adjust spaces around en dash 386 } else { 387 value = normalizeHyphens(value); 388 } 389 // Fix up possibly empty field 390 if (value.isEmpty() 391 && (path.startsWith(FSR_START_PATH) 392 || path.startsWith(NSR_START_PATH) 393 || path.startsWith(NOL_START_PATH))) { 394 value = EMPTY_ELEMENT_VALUE; 395 } 396 return value; 397 } 398 hasUnicodeSetValue(String path)399 private boolean hasUnicodeSetValue(String path) { 400 return path.startsWith("//ldml/characters/exemplarCharacters") 401 || path.startsWith("//ldml/characters/parseLenients"); 402 } 403 404 static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS); 405 406 private static final String BAR_VL = "\\|"; // U+007C VERTICAL LINE (pipe, bar) literal 407 private static final String BAR_EL = "\\s+l\\s+"; // U+006C LATIN SMALL LETTER L with space 408 private static final String BAR_DANDA = "।"; // U+0964 DEVANAGARI DANDA 409 private static final String BAR_REGEX = "(" + BAR_VL + "|" + BAR_EL + "|" + BAR_DANDA + ")"; 410 public static final Splitter SPLIT_BAR = 411 Splitter.on(Pattern.compile(BAR_REGEX)).trimResults().omitEmptyStrings(); 412 static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings(); 413 static final Joiner JOIN_BAR = Joiner.on(" | "); 414 static final Joiner JOIN_SPACE = Joiner.on(' '); 415 416 /** 417 * Process the value for input. The result is a cleaned-up value. For example, an exemplar set 418 * is modified to be in the normal format, and any missing [ ] are added (a common omission on 419 * entry). If there are any failures then the original value is returned, so that the proper 420 * error message can be given. 
421 * 422 * @param path 423 * @param value 424 * @param internalException to be filled in if RuntimeException occurs 425 * @return the possibly modified value 426 */ processInput( String path, String value, Exception[] internalException)427 public synchronized String processInput( 428 String path, String value, Exception[] internalException) { 429 // skip processing for inheritance marker 430 if (CldrUtility.INHERITANCE_MARKER.equals(value)) { 431 return value; 432 } 433 final String original = value; 434 value = stripProblematicControlCharacters(value); 435 value = Normalizer.compose(value, false); // Always normalize all input to NFC. 436 value = value.replace('\u00B5', '\u03BC'); // use the right Greek mu character 437 if (internalException != null) { 438 internalException[0] = null; 439 } 440 // for root annotations 441 if (CLDRLocale.ROOT.equals(locale) && path.contains("/annotations")) { 442 return value; 443 } 444 try { 445 value = processInputMore(path, value); 446 } catch (RuntimeException e) { 447 if (internalException != null) { 448 internalException[0] = e; 449 } 450 return original; 451 } 452 return value; 453 } 454 processInputMore(String path, String value)455 private String processInputMore(String path, String value) { 456 final boolean isUnicodeSet = hasUnicodeSetValue(path); 457 if (isUnicodeSet) { 458 return inputUnicodeSet(path, value); 459 } 460 461 value = processLocaleSpecificInput(path, value, isUnicodeSet); 462 463 if (UNICODE_WHITESPACE.containsSome(value)) { 464 value = normalizeWhitespace(path, value); 465 } 466 467 // remove the empty value (mostly relevant for person names, 468 // but prevents it showing up elsewhere by mistake 469 value = value.replace(EMPTY_ELEMENT_VALUE, ""); 470 471 // all of our values should not have leading or trailing spaces, except insertBetween, 472 // foreignSpaceReplacement, and anything with built-in attribute xml:space="preserve" 473 if (!path.contains("/insertBetween") 474 && 
!path.contains("/foreignSpaceReplacement") 475 && !path.contains("/nativeSpaceReplacement") 476 && !path.contains("[@xml:space=\"preserve\"]") 477 && !isUnicodeSet) { 478 value = value.trim(); 479 } 480 481 // fix grouping separator if space 482 if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) { 483 if (value.isEmpty()) { 484 value = "\u00A0"; 485 } 486 value = value.replace(' ', '\u00A0'); 487 } 488 489 // fix date patterns 490 DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path); 491 if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) { 492 try { 493 value = dtc.getCanonicalDatePattern(path, value, datetimePatternType); 494 } catch (IllegalArgumentException ex) { 495 return value; 496 } 497 } 498 499 if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) { 500 value = normalizeCurrencyDisplayName(value); 501 } 502 NumericType numericType = NumericType.getNumericType(path); 503 if (numericType != NumericType.NOT_NUMERIC) { 504 if (numericType == NumericType.CURRENCY) { 505 value = value.replaceAll(" ", "\u00A0"); 506 // NOTE: the following "if ... 
NumericType.CURRENCY_ABBREVIATED" was false here, 507 // since we know it is NumericType.CURRENCY; so now the code is commented out; if 508 // anyone 509 // understands what the intention was, maybe the condition should be restored 510 // somehow, 511 // such as with "else if" 512 // if (numericType == NumericType.CURRENCY_ABBREVIATED) { 513 // value = value.replaceAll("0\\.0+", "0"); 514 // } 515 } else { 516 value = 517 value.replaceAll("([%\u00A4]) ", "$1\u00A0") 518 .replaceAll(" ([%\u00A4])", "\u00A0$1"); 519 value = replace(NON_DECIMAL_PERIOD, value, "'.'"); 520 if (numericType == NumericType.DECIMAL_ABBREVIATED) { 521 value = value.replaceAll("0\\.0+", "0"); 522 } 523 } 524 value = getCanonicalPattern(value, numericType, isPosix); 525 } 526 527 // fix [,] 528 if (path.startsWith("//ldml/localeDisplayNames/languages/language") 529 || path.startsWith("//ldml/localeDisplayNames/scripts/script") 530 || path.startsWith("//ldml/localeDisplayNames/territories/territory") 531 || path.startsWith("//ldml/localeDisplayNames/variants/variant") 532 || path.startsWith("//ldml/localeDisplayNames/keys/key") 533 || path.startsWith("//ldml/localeDisplayNames/types/type")) { 534 value = value.replace('[', '(').replace(']', ')').replace('[', '(').replace(']', ')'); 535 } 536 537 // Normalize two single quotes for the inches symbol. 538 if (path.contains("/units")) { 539 value = value.replace("''", "″"); 540 } 541 542 // check specific cases 543 // if (isUnicodeSet) { 544 // value = inputUnicodeSet(path, value); 545 // } else 546 if (path.contains("stopword")) { 547 if (value.equals("NONE")) { 548 value = ""; 549 } 550 } 551 552 // Normalize ellipsis data. 
553 if (path.startsWith("//ldml/characters/ellipsis")) { 554 value = value.replace("...", "…"); 555 } 556 557 if (path.startsWith(NOL_START_PATH)) { 558 value = normalizeNameOrderLocales(value); 559 } 560 561 // Replace Arabic presentation forms with their nominal counterparts 562 value = replaceArabicPresentationForms(value); 563 564 // Fix up any apostrophes as appropriate (Don't do so for things like date patterns... 565 if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 566 value = normalizeApostrophes(value); 567 } 568 // Fix up any apostrophes in number symbols 569 if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) { 570 value = value.replace('\'', '\u2019'); 571 } 572 // Fix up hyphens, replacing with N-dash as appropriate 573 if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) { 574 value = 575 normalizeIntervalHyphensAndSpaces( 576 value); // This may also adjust spaces around en dash 577 } else if (!isUnicodeSet) { 578 value = normalizeHyphens(value); 579 } 580 value = processAnnotations(path, value); 581 value = normalizeZeroWidthSpace(value); 582 if (VoteResolver.DROP_HARD_INHERITANCE) { 583 value = replaceBaileyWithInheritanceMarker(path, value); 584 } 585 return value; 586 } 587 processLocaleSpecificInput(String path, String value, boolean isUnicodeSet)588 private String processLocaleSpecificInput(String path, String value, boolean isUnicodeSet) { 589 if (locale.childOf(MALAYALAM)) { 590 String newvalue = normalizeMalayalam(value); 591 if (DEBUG_DAIP) 592 System.out.println( 593 "DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'"); 594 value = newvalue; 595 } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) { 596 value = standardizeRomanian(value); 597 } else if (locale.childOf(CATALAN) && !isUnicodeSet) { 598 value = standardizeCatalan(value); 599 } else if (locale.childOf(NGOMBA) && !isUnicodeSet) { 600 value = standardizeNgomba(value); 601 } else if (locale.childOf(KWASIO) && !isUnicodeSet) { 602 value = 
standardizeKwasio(value); 603 } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) { 604 value = replaceChars(path, value, HEBREW_CONVERSIONS, false); 605 } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND)) 606 && !isUnicodeSet) { 607 value = standardizeSwissGerman(value); 608 } else if (locale.childOf(MYANMAR) && !isUnicodeSet) { 609 value = standardizeMyanmar(value); 610 } else if (locale.childOf(KYRGYZ)) { 611 value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false); 612 } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) { 613 value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true); 614 } else if (locale.childOf(FF_ADLAM) && !isUnicodeSet) { 615 value = fixAdlamNasalization(value); 616 } else if (locale.childOf(KASHMIRI)) { 617 value = replaceChars(path, value, KASHMIRI_CONVERSIONS, false); 618 } 619 return value; 620 } 621 processAnnotations(String path, String value)622 private String processAnnotations(String path, String value) { 623 if (AnnotationUtil.pathIsAnnotation(path)) { 624 if (path.contains(Emoji.TYPE_TTS)) { 625 // The row has something like " -name" in the first column. Cf. namePath, 626 // getNamePaths. 627 // Normally the value is like "zebra" or "unicorn face", without "|". 628 // If the user enters a value with "|", discard anything after "|"; e.g., change "a 629 // | b | c" to "a". 630 value = SPLIT_BAR.split(value).iterator().next(); 631 } else { 632 // The row has something like " –keywords" in the first column. Cf. keywordPath, 633 // getKeywordPaths. 634 // Normally the value is like "stripe | zebra", with "|". 
635 value = annotationsForDisplay(value); 636 } 637 } 638 return value; 639 } 640 normalizeNameOrderLocales(String value)641 private String normalizeNameOrderLocales(String value) { 642 value = value.replace(EMPTY_ELEMENT_VALUE, ""); 643 TreeSet<String> result = new TreeSet<>(SPLIT_SPACE.splitToList(value)); 644 result.remove(LocaleNames.ZXX); 645 if (result.remove(LocaleNames.UND)) { // put und at the front 646 if (result.isEmpty()) { 647 return LocaleNames.UND; 648 } else { 649 return LocaleNames.UND + " " + JOIN_SPACE.join(result); 650 } 651 } 652 return JOIN_SPACE.join(result); 653 } 654 655 /** 656 * Strip out all code points less than U+0020 except for U+0009 tab, U+000A line feed, and 657 * U+000D carriage return. 658 * 659 * @param s the string 660 * @return the resulting string 661 */ stripProblematicControlCharacters(String s)662 private String stripProblematicControlCharacters(String s) { 663 if (s == null || s.isEmpty()) { 664 return s; 665 } 666 return s.codePoints() 667 .filter(c -> (c >= 0x20 || c == 9 || c == 0xA || c == 0xD)) 668 .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) 669 .toString(); 670 } 671 672 private static final boolean REMOVE_COVERED_KEYWORDS = true; 673 674 /** 675 * Produce a modification of the given annotation by sorting its components and filtering 676 * covered keywords. 677 * 678 * <p>Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear | 679 * panda". 
680 * 681 * @param value the string 682 * @return the possibly modified string 683 */ annotationsForDisplay(String value)684 private static String annotationsForDisplay(String value) { 685 TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT)); 686 sorted.addAll(SPLIT_BAR.splitToList(value)); 687 if (REMOVE_COVERED_KEYWORDS) { 688 filterCoveredKeywords(sorted); 689 } 690 value = JOIN_BAR.join(sorted); 691 return value; 692 } 693 694 /** 695 * Filter from the given set some keywords that include spaces, if they duplicate, or are 696 * "covered by", other keywords in the set. 697 * 698 * <p>For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda | 699 * panda bear"), then remove "panda bear", treating it as "covered" since the set already 700 * includes "panda" and "bear". Also, for example, if the set is {"bear", "panda", "PANDA 701 * BEAR"}, then remove "PANDA BEAR" even though the casing differs. 702 * 703 * <p>Since casing is complex in many languages/scripts, this method does not attempt to 704 * recognize all occurrences of case-insensitive matching. Instead, it first checks for 705 * case-sensitive (exact) matching, then it checks for case-insensitive (loose) matching 706 * according to Locale.ROOT. The intended effect is only to remove an item like "PANDA BEAR" if 707 * both "panda" and "bear" are already present as individual items. The intended effect is never 708 * to modify the casing of any item that is already present. 
709 * 710 * @param sorted the set from which items may be removed 711 */ filterCoveredKeywords(TreeSet<String> sorted)712 public static void filterCoveredKeywords(TreeSet<String> sorted) { 713 // for now, just do single items 714 HashSet<String> toRemove = new HashSet<>(); 715 716 TreeSet<String> sortedLower = new TreeSet<>(); 717 for (String item : sorted) { 718 sortedLower.add(item.toLowerCase(Locale.ROOT)); 719 } 720 for (String item : sorted) { 721 List<String> list = SPLIT_SPACE.splitToList(item); 722 if (list.size() < 2) { 723 continue; 724 } 725 if (sorted.containsAll(list)) { 726 toRemove.add(item); 727 } else { 728 List<String> listLower = new ArrayList<>(); 729 for (String s : list) { 730 listLower.add(s.toLowerCase(Locale.ROOT)); 731 } 732 if (sortedLower.containsAll(listLower)) { 733 toRemove.add(item); 734 } 735 } 736 } 737 sorted.removeAll(toRemove); 738 } 739 740 /** 741 * Given a sorted list like "BEAR | Bear | PANDA | Panda | panda",filter out any items that 742 * duplicate other items aside from case, leaving only, for example, "BEAR | PANDA" 743 * 744 * @param sorted the set from which items may be removed 745 */ filterKeywordsDifferingOnlyInCase(TreeSet<String> sorted)746 public static void filterKeywordsDifferingOnlyInCase(TreeSet<String> sorted) { 747 TreeMultimap<String, String> mapFromLower = TreeMultimap.create(); 748 for (String item : sorted) { 749 mapFromLower.put(item.toLowerCase(), item); 750 } 751 TreeSet<String> toRetain = new TreeSet<>(); 752 for (String lower : mapFromLower.keySet()) { 753 Set<String> variants = mapFromLower.get(lower); 754 for (String var : variants) { 755 toRetain.add(var); 756 break; 757 } 758 } 759 sorted.retainAll(toRetain); 760 } 761 displayUnicodeSet(String value)762 private String displayUnicodeSet(String value) { 763 return pp.format( 764 new UnicodeSet(value)); // will throw exception if bad format, eg missing [...] 
765 } 766 inputUnicodeSet(String path, String value)767 private String inputUnicodeSet(String path, String value) { 768 UnicodeSet exemplar = null; 769 // hack, in case the input is called twice 770 value = value.trim(); 771 if (value.startsWith("[") && value.endsWith("]")) { 772 try { 773 exemplar = new UnicodeSet(value); 774 } catch (Exception e2) { 775 // fall through 776 } 777 } 778 if (exemplar == null) { 779 try { 780 exemplar = pp.parse(value); 781 } catch (Exception e) { 782 // can't parse at all 783 return value; // we can't throw an exception because clients won't expect it. 784 } 785 } 786 XPathParts parts = XPathParts.getFrozenInstance(path); 787 // if (parts.getElement(2).equals("parseLenients")) { 788 // return exemplar.toPattern(false); 789 // } 790 final String type = parts.getAttributeValue(-1, "type"); 791 ExemplarType exemplarType = 792 !path.contains("exemplarCharacters") 793 ? null 794 : type == null ? ExemplarType.main : ExemplarType.valueOf(type); 795 value = getCleanedUnicodeSet(exemplar, exemplarType); 796 return value; 797 } 798 normalizeCurrencyDisplayName(String value)799 private String normalizeCurrencyDisplayName(String value) { 800 StringBuilder result = new StringBuilder(); 801 boolean inParentheses = false; 802 for (int i = 0; i < value.length(); i++) { 803 char c = value.charAt(i); 804 if (c == '(') { 805 inParentheses = true; 806 } else if (c == ')') { 807 inParentheses = false; 808 } 809 if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) { 810 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */ 811 } 812 result.append(c); 813 } 814 return result.toString(); 815 } 816 normalizeApostrophes(String value)817 private String normalizeApostrophes(String value) { 818 // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set 819 // in it to see. 820 // But since we don't, we just maintain the list internally and use it. 
821 if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) { 822 return value.replace('\'', '\u02bc'); 823 } else { 824 char prev = 0; 825 StringBuilder builder = new StringBuilder(); 826 for (char c : value.toCharArray()) { 827 if (c == '\'') { 828 if (Character.isLetter(prev)) { 829 builder.append('\u2019'); 830 } else { 831 builder.append('\u2018'); 832 } 833 } else { 834 builder.append(c); 835 } 836 prev = c; 837 } 838 return builder.toString(); 839 } 840 } 841 normalizeIntervalHyphensAndSpaces(String value)842 private String normalizeIntervalHyphensAndSpaces(String value) { 843 if (value.contains("{0}")) { 844 // intervalFormatFallback pattern, not handled by DateTimePatternGenerator.FormatParser 845 if (scriptCode.equals("Latn")) { 846 value = INTERVAL_FALLBACK_RANGE.matcher(value).replaceAll("}\u2009\u2013\u2009{"); 847 } 848 return value; 849 } 850 DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser(); 851 fp.set( 852 DateIntervalInfo.genPatternInfo(value, false) 853 .getFirstPart()); // first format & separator including spaces 854 List<Object> items = fp.getItems(); 855 Object last = items.get(items.size() - 1); 856 if (last instanceof String) { 857 String separator = 858 last.toString(); // separator including spaces, and possibly preceding 859 // literal text (. 
or quoted) 860 String replacement = separator; 861 if (scriptCode.equals("Latn") 862 && (separator.endsWith(" - ") || separator.endsWith(" \u2013 "))) { 863 replacement = 864 separator.substring(0, separator.length() - 3) 865 + "\u2009\u2013\u2009"; // Per CLDR-14032,16308 866 } else if (separator.contains("-")) { 867 replacement = separator.replace("-", "\u2013"); 868 } 869 if (!replacement.equals(separator)) { 870 StringBuilder sb = new StringBuilder(); 871 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); 872 if (sb.lastIndexOf(separator) >= 0) { 873 sb.delete(sb.lastIndexOf(separator), sb.length()); 874 sb.append(replacement); 875 sb.append( 876 DateIntervalInfo.genPatternInfo(value, false) 877 .getSecondPart()); // second format only 878 return sb.toString(); 879 } 880 } 881 } 882 return value; 883 } 884 normalizeHyphens(String value)885 private String normalizeHyphens(String value) { 886 int hyphenLocation = value.indexOf("-"); 887 if (hyphenLocation > 0 888 && Character.isDigit(value.charAt(hyphenLocation - 1)) 889 && hyphenLocation < value.length() - 1 890 && Character.isDigit(value.charAt(hyphenLocation + 1))) { 891 return value.substring(0, hyphenLocation) 892 + "\u2013" 893 + value.substring(hyphenLocation + 1); 894 } 895 return value; 896 } 897 standardizeRomanian(String value)898 private String standardizeRomanian(String value) { 899 StringBuilder builder = new StringBuilder(); 900 for (char c : value.toCharArray()) { 901 for (char[] pair : ROMANIAN_CONVERSIONS) { 902 if (c == pair[0]) { 903 c = pair[1]; 904 break; 905 } 906 } 907 builder.append(c); 908 } 909 return builder.toString(); 910 } 911 standardizeKwasio(String value)912 private String standardizeKwasio(String value) { 913 StringBuilder builder = new StringBuilder(); 914 for (char c : value.toCharArray()) { 915 for (char[] pair : KWASIO_CONVERSIONS) { 916 if (c == pair[0]) { 917 c = pair[1]; 918 break; 919 } 920 } 921 builder.append(c); 922 } 923 return 
builder.toString(); 924 } 925 926 // Use the myanmar-tools detector. standardizeMyanmar(String value)927 private String standardizeMyanmar(String value) { 928 if (detector.getZawgyiProbability(value) > 0.90) { 929 return zawgyiUnicodeTransliterator.transform(value); 930 } 931 return value; 932 } 933 standardizeNgomba(String value)934 private String standardizeNgomba(String value) { 935 StringBuilder builder = new StringBuilder(); 936 char[] charArray = value.toCharArray(); 937 for (int i = 0; i < charArray.length; i++) { 938 char c = charArray[i]; 939 boolean convertedSaltillo = false; 940 for (char[] pair : NGOMBA_CONVERSIONS) { 941 if (c == pair[0]) { 942 c = pair[1]; 943 if (c == '\uA78C') { 944 convertedSaltillo = true; 945 } 946 break; 947 } 948 } 949 if (convertedSaltillo 950 && ((i > 0 951 && i < charArray.length - 1 952 && Character.isUpperCase(charArray[i - 1]) 953 && Character.isUpperCase(charArray[i + 1])) 954 || (i > 1 955 && Character.isUpperCase(charArray[i - 1]) 956 && Character.isUpperCase(charArray[i - 2])))) { 957 c = '\uA78B'; // UPPER CASE SALTILLO 958 } 959 builder.append(c); 960 } 961 return builder.toString(); 962 } 963 replaceChars( String path, String value, char[][] charsToReplace, boolean skipAuxExemplars)964 private String replaceChars( 965 String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) { 966 if (skipAuxExemplars && path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) { 967 return value; 968 } 969 StringBuilder builder = new StringBuilder(); 970 for (char c : value.toCharArray()) { 971 for (char[] pair : charsToReplace) { 972 if (c == pair[0]) { 973 c = pair[1]; 974 break; 975 } 976 } 977 builder.append(c); 978 } 979 return builder.toString(); 980 } 981 standardizeSwissGerman(String value)982 private String standardizeSwissGerman(String value) { 983 return value.replaceAll("\u00DF", "ss"); 984 } 985 standardizeCatalan(String value)986 private String standardizeCatalan(String value) { 987 StringBuilder 
builder = new StringBuilder(); 988 for (char c : value.toCharArray()) { 989 boolean didSubstitute = false; 990 for (char[] triple : CATALAN_CONVERSIONS) { 991 if (c == triple[0]) { 992 builder.append(triple[1]); 993 builder.append(triple[2]); 994 didSubstitute = true; 995 break; 996 } 997 } 998 if (!didSubstitute) { 999 builder.append(c); 1000 } 1001 } 1002 return builder.toString(); 1003 } 1004 replace(Pattern pattern, String value, String replacement)1005 private String replace(Pattern pattern, String value, String replacement) { 1006 String value2 = pattern.matcher(value).replaceAll(replacement); 1007 if (DEBUG_DAIP && !value.equals(value2)) { 1008 System.out.println("\n" + value + " => " + value2); 1009 } 1010 return value2; 1011 } 1012 1013 private static final Pattern UNNORMALIZED_MALAYALAM = 1014 PatternCache.get("(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D"); 1015 1016 private static final Map<Character, Character> NORMALIZING_MAP = 1017 Builder.with(new HashMap<Character, Character>()) 1018 .put('\u0D23', '\u0D7A') 1019 .put('\u0D28', '\u0D7B') 1020 .put('\u0D30', '\u0D7C') 1021 .put('\u0D32', '\u0D7D') 1022 .put('\u0D33', '\u0D7E') 1023 .put('\u0D15', '\u0D7F') 1024 .get(); 1025 1026 /** 1027 * Normalizes the Malayalam characters in the specified input. 1028 * 1029 * @param value the input to be normalized 1030 * @return 1031 */ normalizeMalayalam(String value)1032 private String normalizeMalayalam(String value) { 1033 // Normalize Malayalam characters. 
1034 Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value); 1035 if (matcher.find()) { 1036 StringBuffer buffer = new StringBuffer(); 1037 int start = 0; 1038 do { 1039 buffer.append(value, start, matcher.start(0)); 1040 char codePoint = matcher.group(1).charAt(0); 1041 buffer.append(NORMALIZING_MAP.get(codePoint)); 1042 start = matcher.end(0); 1043 } while (matcher.find()); 1044 buffer.append(value.substring(start)); 1045 value = buffer.toString(); 1046 } 1047 return value; 1048 } 1049 1050 static final Transform<String, String> fixArabicPresentation = 1051 Transliterator.getInstance( 1052 "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc"); 1053 1054 /** 1055 * Normalizes the Arabic presentation forms characters in the specified input. 1056 * 1057 * @param value the input to be normalized 1058 * @return 1059 */ replaceArabicPresentationForms(String value)1060 private String replaceArabicPresentationForms(String value) { 1061 value = fixArabicPresentation.transform(value); 1062 return value; 1063 } 1064 1065 static Pattern ADLAM_MISNASALIZED = PatternCache.get("([])['’‘]([])"); 1066 public static String ADLAM_NASALIZATION = ""; // U+1E94B (Unicode 12.0) 1067 fixAdlamNasalization(String fromString)1068 public static String fixAdlamNasalization(String fromString) { 1069 return ADLAM_MISNASALIZED 1070 .matcher(fromString) 1071 .replaceAll("$1" + ADLAM_NASALIZATION + "$2"); // replace quote with 1072 } 1073 getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType)1074 public String getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType) { 1075 1076 if (rawFormatter == null) { 1077 throw new IllegalArgumentException("Formatter must not be null"); 1078 } 1079 if (exemplar == null) { 1080 throw new IllegalArgumentException("set to be cleaned must not be null"); 1081 } 1082 1083 String value; 1084 // prettyPrinter.setCompressRanges(exemplar.size() > 300); 1085 // value = exemplar.toPattern(false); 1086 UnicodeSet 
toAdd = new UnicodeSet(); 1087 1088 for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next(); ) { 1089 String string = usi.getString(); 1090 if (string.equals("ß") || string.equals("İ")) { 1091 toAdd.add(string); 1092 continue; 1093 } 1094 switch (string) { 1095 case "\u2011": 1096 toAdd.add("-"); 1097 break; // nobreak hyphen 1098 case "-": 1099 toAdd.add("\u2011"); 1100 break; // nobreak hyphen 1101 1102 case " ": 1103 toAdd.add("\u00a0"); 1104 break; // nobreak space 1105 case "\u00a0": 1106 toAdd.add(" "); 1107 break; // nobreak space 1108 1109 case "\u202F": 1110 toAdd.add("\u2009"); 1111 break; // nobreak narrow space 1112 case "\u2009": 1113 toAdd.add("\u202F"); 1114 break; // nobreak narrow space 1115 } 1116 if (exemplarType != null && exemplarType.convertUppercase) { 1117 string = UCharacter.toLowerCase(ULocale.ENGLISH, string); 1118 } 1119 toAdd.add(string); 1120 // we allow 1121 String composed = Normalizer.compose(string, false); 1122 if (!string.equals(composed)) { 1123 toAdd.add(composed); 1124 } 1125 } 1126 1127 if (exemplarType != null) { 1128 toAdd.removeAll(exemplarType.toRemove); 1129 } 1130 value = rawFormatter.format(toAdd); 1131 return value; 1132 } 1133 1134 static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults(); 1135 1136 /** 1137 * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is 1138 * set for en_US_POSIX. 
1139 */ getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX)1140 public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) { 1141 // TODO fix later to properly handle quoted ; 1142 1143 DecimalFormat df = new DecimalFormat(inpattern); 1144 if (type == NumericType.DECIMAL_ABBREVIATED 1145 || type == NumericType.CURRENCY_ABBREVIATED 1146 || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) { 1147 return inpattern; // TODO fix when ICU bug is fixed 1148 // df.setMaximumFractionDigits(df.getMinimumFractionDigits()); 1149 // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits())); 1150 } else { 1151 // int decimals = type == CURRENCY_TYPE ? 2 : 1; 1152 int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount; 1153 df.setMinimumIntegerDigits(digits[0]); 1154 df.setMinimumFractionDigits(digits[1]); 1155 df.setMaximumFractionDigits(digits[2]); 1156 } 1157 String pattern = df.toPattern(); 1158 List<String> parts = SEMI_SPLITTER.splitToList(pattern); 1159 String pattern2 = parts.get(0); 1160 if (parts.size() > 1) { 1161 pattern2 += ";" + parts.get(1); 1162 } 1163 if (!pattern2.equals(pattern)) { 1164 pattern = pattern2; 1165 } 1166 // int pos = pattern.indexOf(';'); 1167 // if (pos < 0) return pattern + ";-" + pattern; 1168 return pattern; 1169 } 1170 enableInheritanceReplacement(CLDRFile cldrFile)1171 public void enableInheritanceReplacement(CLDRFile cldrFile) { 1172 cldrFileForBailey = cldrFile; 1173 } 1174 1175 /* 1176 * This tests what type a numeric pattern is. 
1177 */ 1178 public enum NumericType { 1179 CURRENCY(new int[] {1, 2, 2}, new int[] {1, 2, 2}), 1180 CURRENCY_ABBREVIATED(), 1181 DECIMAL(new int[] {1, 0, 3}, new int[] {1, 0, 6}), 1182 DECIMAL_ABBREVIATED(), 1183 PERCENT(new int[] {1, 0, 0}, new int[] {1, 0, 0}), 1184 SCIENTIFIC(new int[] {0, 0, 0}, new int[] {1, 6, 6}), 1185 NOT_NUMERIC; 1186 1187 private static final Pattern NUMBER_PATH = 1188 Pattern.compile( 1189 "//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*"); 1190 private int[] digitCount; 1191 private int[] posixDigitCount; 1192 NumericType()1193 NumericType() {} 1194 NumericType(int[] digitCount, int[] posixDigitCount)1195 NumericType(int[] digitCount, int[] posixDigitCount) { 1196 this.digitCount = digitCount; 1197 this.posixDigitCount = posixDigitCount; 1198 } 1199 1200 /** 1201 * @return the numeric type of the xpath 1202 */ getNumericType(String xpath)1203 public static NumericType getNumericType(String xpath) { 1204 Matcher matcher = NUMBER_PATH.matcher(xpath); 1205 if (!xpath.contains("/pattern")) { 1206 return NOT_NUMERIC; 1207 } else if (matcher.matches()) { 1208 if (matcher.group(1).equals("currencies/currency")) { 1209 return CURRENCY; 1210 } else { 1211 NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase()); 1212 if (xpath.contains("=\"1000")) { 1213 if (type == DECIMAL) { 1214 type = DECIMAL_ABBREVIATED; 1215 } else if (type == CURRENCY) { 1216 type = CURRENCY_ABBREVIATED; 1217 } else { 1218 throw new IllegalArgumentException("Internal Error"); 1219 } 1220 } 1221 return type; 1222 } 1223 } else { 1224 return NOT_NUMERIC; 1225 } 1226 } 1227 getDigitCount()1228 public int[] getDigitCount() { 1229 return digitCount; 1230 } 1231 getPosixDigitCount()1232 public int[] getPosixDigitCount() { 1233 return posixDigitCount; 1234 } 1235 } 1236 1237 /** 1238 * Turn all whitespace sequences (including tab and newline, and NBSP for certain paths) into a 1239 * single space or a single NBSP depending on 
path. Also trim initial/final NBSP, unless the
     * value is only the one character, "\u00A0"
     *
     * <p>After that, further path- and script-specific spacing rules are applied, and finally any
     * run of mixed spaces is reduced to the most restrictive space it contains.
     *
     * @param path the xpath of the value; its PathSpaceType selects which space is allowed
     * @param value the value to normalize
     * @return the normalized value
     * @throws IllegalArgumentException if the PathSpaceType of the path is not one handled here
     */
    private String normalizeWhitespace(String path, String value) {
        PathSpaceType pst = PathSpaceType.get(path);
        if (pst == PathSpaceType.allowSp) {
            value =
                    WHITESPACE_AND_NBSP_TO_NORMALIZE
                            .matcher(value)
                            .replaceAll(" "); // replace with regular space
        } else if (pst == PathSpaceType.allowNbsp) {
            value =
                    WHITESPACE_AND_NBSP_TO_NORMALIZE
                            .matcher(value)
                            .replaceAll("\u00A0"); // replace with NBSP
            value = trimNBSP(value);
        } else if (pst == PathSpaceType.allowNNbsp) {
            value =
                    WHITESPACE_AND_NBSP_TO_NORMALIZE
                            .matcher(value)
                            .replaceAll("\u202F"); // replace with NNBSP
            value = trimNBSP(value);
        } else if (pst == PathSpaceType.allowSpOrNbsp) {
            /*
             * in this case don't normalize away NBSP
             */
            value =
                    WHITESPACE_NO_NBSP_TO_NORMALIZE
                            .matcher(value)
                            .replaceAll(" "); // replace with regular space
            /*
             * if any NBSP and regular space are adjacent, replace with NBSP
             */
            value = NBSP_PLUS_SPACE_TO_NORMALIZE.matcher(value).replaceAll("\u00A0");
            value = SPACE_PLUS_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0");
            value = MULTIPLE_NBSP.matcher(value).replaceAll("\u00A0");
            value = trimNBSP(value);
        } else {
            throw new IllegalArgumentException("Unknown PathSpaceType " + pst);
        }

        // Further whitespace adjustments per CLDR-14032
        if ((scriptCode.equals("Latn") || scriptCode.equals("Cyrl") || scriptCode.equals("Grek"))
                && HOUR_FORMAT_XPATHS.matcher(path).matches()) {
            // "test" is only used to measure how many characters the match would remove; a
            // difference of exactly 4 is taken to mean "aaaa", which is left alone.
            String test = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1$2"); // value without a+
            // paths containing "ascii" get a regular space, all others a narrow NBSP (U+202F)
            String spaceReplace = path.contains("ascii") ? "$1\u0020$3" : "$1\u202F$3";
            if (value.length() - test.length() != 4) { // exclude patterns with aaaa
                value = AMPM_SPACE_BEFORE.matcher(value).replaceAll(spaceReplace);
            }
            test = AMPM_SPACE_AFTER.matcher(value).replaceAll("$2$3"); // value without a+
            if (value.length() - test.length() != 4) { // exclude patterns with aaaa
                value = AMPM_SPACE_AFTER.matcher(value).replaceAll(spaceReplace);
            }
        }
        if (scriptCode.equals("Cyrl") && YEAR_FORMAT_XPATHS.matcher(path).matches()) {
            value = YEAR_SPACE_YEARMARKER.matcher(value).replaceAll("y\u202F$1");
        }
        if (UNIT_NARROW_XPATHS.matcher(path).matches()) {
            value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u202F"); // Narrow NBSP
            value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u202F{");
        }
        if (UNIT_SHORT_XPATHS.matcher(path).matches()) {
            value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u00A0"); // Regular NBSP
            value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u00A0{");
        }

        // Finally, replace remaining space combinations with most restrictive type CLDR-17233
        // The order of the three replacements below gives the precedence NNBSP, NBSP, THIN SPACE.
        // If we have NNBSP U+202F in combination with other spaces, keep just it
        value = NNBSP_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u202F");
        // Else if we have NBSP U+00A0 in combination with other spaces, keep just it
        value = NBSP_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u00A0");
        // Else if we have THIN SPACE U+2009 in combination with other spaces, keep just it
        value = THIN_SPACE_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u2009");

        return value;
    }

    /**
     * Delete any initial or final NBSP or NNBSP, unless the value is just NBSP or NNBSP
     *
     * @param value the value to trim
     * @return the trimmed value
     */
    private String trimNBSP(String value) {
        // a value that consists of exactly one NBSP or one NNBSP is kept as it is
        if (!value.equals("\u00A0") && !value.equals("\u202F")) {
            value = INITIAL_NBSP.matcher(value).replaceAll("");
            value = FINAL_NBSP.matcher(value).replaceAll("");
        }
        return value;
    }

    /** Categorize xpaths according to whether they allow space, NBSP, or both */
    public enum PathSpaceType {
        allowSp,
        allowNbsp,
        allowNNbsp,
        allowSpOrNbsp;

        /**
         * Returns the space type of a path. The checks run in a fixed order (regular space, then
         * NBSP, then NNBSP), so a path that satisfies more than one gets the first that applies;
         * a path that satisfies none allows both space and NBSP.
         *
         * @param path the xpath to categorize
         * @return the space type, never null
         */
        public static PathSpaceType get(String path) {
            if (wantsRegularSpace(path)) {
                return allowSp;
            } else if (wantsNBSP(path)) {
                return allowNbsp;
            } else if (wantsNNBSP(path)) {
                return allowNNbsp;
            } else {
                return allowSpOrNbsp;
            }
        }

        /** Returns true for the paths whose values get only regular spaces. */
        private static boolean wantsRegularSpace(String path) {
            // NOTE(review): "/unitPattern " in the last clause ends with a space, which looks
            // unlikely to occur in an xpath -- confirm whether that clause is meant to match.
            if ((path.contains("/dateFormatLength") && path.contains("/pattern"))
                    || path.contains("/availableFormats/dateFormatItem")
                    || (path.startsWith("//ldml/dates/timeZoneNames/metazone")
                            && path.contains("/long"))
                    || path.startsWith("//ldml/dates/timeZoneNames/regionFormat")
                    || path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern")
                    || path.startsWith("//ldml/localeDisplayNames/languages/language")
                    || path.startsWith("//ldml/localeDisplayNames/territories/territory")
                    || path.startsWith("//ldml/localeDisplayNames/types/type")
                    || (path.startsWith("//ldml/numbers/currencies/currency")
                            && path.contains("/displayName"))
                    || (path.contains("/decimalFormatLength[@type=\"long\"]")
                            && path.contains("/pattern"))
                    || path.startsWith("//ldml/posix/messages")
                    || (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "))) {
                return true;
            }
            return false;
        }

        /**
         * Returns true for the paths whose values get only NBSP. The long decimal formats also
         * satisfy the decimalFormatLength clause here, but get() asks wantsRegularSpace first.
         */
        private static boolean wantsNBSP(String path) {
            if ((path.contains("/currencies/currency")
                            && (path.contains("/group") || path.contains("/pattern")))
                    || (path.contains("/currencyFormatLength") && path.contains("/pattern"))
                    || (path.contains("/currencySpacing") && path.contains("/insertBetween"))
                    || (path.contains("/decimalFormatLength") && path.contains("/pattern"))
                    || // i.e. the non-long ones
                    (path.contains("/percentFormatLength") && path.contains("/pattern"))
                    || (path.startsWith("//ldml/numbers/symbols")
                            && (path.contains("/group") || path.contains("/nan")))) {
                return true;
            }
            return false;
        }

        /**
         * Returns true for the paths whose values get only NNBSP: the am and pm day periods of
         * abbreviated or narrow width.
         */
        private static boolean wantsNNBSP(String path) {
            if ((path.contains("/dayPeriodWidth[@type=\"abbreviated\"]")
                            || path.contains("/dayPeriodWidth[@type=\"narrow\"]"))
                    && (path.contains("/dayPeriod[@type=\"am\"]")
                            || path.contains("/dayPeriod[@type=\"pm\"]"))) {
                return true;
            }
            return false;
        }
    }

    // one or more consecutive U+200B characters
    private static final Pattern ZERO_WIDTH_SPACES = PatternCache.get("\\u200B+");
    // compared against the locale's base name in normalizeZeroWidthSpace
    private static final Set<String> LOCALES_NOT_ALLOWING_ZWS =
            new HashSet<>(Arrays.asList("da", "fr"));

    /**
     * Remove occurrences of U+200B ZERO_WIDTH_SPACE under certain conditions
     *
     * @param value the value to be normalized
     * @return the normalized value
     *     <p>TODO: extend this method to address more concerns, after clarifying the conditions -
     *     enlarge the set LOCALES_NOT_ALLOWING_ZWS? - strip initial and final ZWS in all locales? -
     *     reduce two or more adjacent ZWS to one ZWS?
- allow or prohibit ZWS by itself as currency 1412 * symbol, as currently in locales kea, pt_CV, pt_PT - allow or prohibit ZWS preceding URL 1413 * as in "as per [U+200B]https://www.unicode.org/reports/tr35/tr35-general.html#Annotations 1414 * " Reference: https://unicode-org.atlassian.net/browse/CLDR-15976 1415 */ normalizeZeroWidthSpace(String value)1416 private String normalizeZeroWidthSpace(String value) { 1417 if (ZERO_WIDTH_SPACES.matcher(value).find()) { 1418 final String localeId = locale.getBaseName(); 1419 if (LOCALES_NOT_ALLOWING_ZWS.contains(localeId)) { 1420 value = ZERO_WIDTH_SPACES.matcher(value).replaceAll(""); 1421 } 1422 } 1423 return value; 1424 } 1425 1426 /** 1427 * If inheritance replacement is enabled and the value matches the Bailey (inherited) value, 1428 * replace the value with CldrUtility.INHERITANCE_MARKER 1429 * 1430 * <p>This is only appropriate if cldrFileForBailey != null, meaning that 1431 * enableInheritanceReplacement has been called -- some cost may be involved in getting 1432 * cldrFileForBailey and calling getBaileyValue, and some callers of DAIP may not want the 1433 * replacement, so the default, when enableInheritanceReplacement has not been called, is no 1434 * replacement 1435 * 1436 * @param path 1437 * @param value 1438 * @return the value or CldrUtility.INHERITANCE_MARKER 1439 */ replaceBaileyWithInheritanceMarker(String path, String value)1440 public String replaceBaileyWithInheritanceMarker(String path, String value) { 1441 if (cldrFileForBailey != null && !value.isEmpty()) { 1442 Output<String> pathWhereFound = new Output<>(); 1443 Output<String> localeWhereFound = new Output<>(); 1444 String baileyValue = 1445 cldrFileForBailey.getBaileyValue(path, pathWhereFound, localeWhereFound); 1446 if (value.equals(baileyValue) 1447 && !XMLSource.ROOT_ID.equals(localeWhereFound.value) 1448 && !XMLSource.CODE_FALLBACK_ID.equals(localeWhereFound.value)) { 1449 return CldrUtility.INHERITANCE_MARKER; 1450 } 1451 } 1452 return 
value; 1453 } 1454 } 1455