xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 /* Copyright (C) 2007-2013 Google and others.  All Rights Reserved. */
2 /* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */
3 
4 package org.unicode.cldr.test;
5 
6 import com.google.common.base.Joiner;
7 import com.google.common.base.Splitter;
8 import com.google.common.collect.TreeMultimap;
9 import com.google.myanmartools.ZawgyiDetector;
10 import com.ibm.icu.lang.UCharacter;
11 import com.ibm.icu.text.Collator;
12 import com.ibm.icu.text.DateIntervalInfo;
13 import com.ibm.icu.text.DateTimePatternGenerator;
14 import com.ibm.icu.text.DecimalFormat;
15 import com.ibm.icu.text.Normalizer;
16 import com.ibm.icu.text.Transform;
17 import com.ibm.icu.text.Transliterator;
18 import com.ibm.icu.text.UnicodeSet;
19 import com.ibm.icu.text.UnicodeSetIterator;
20 import com.ibm.icu.util.Output;
21 import com.ibm.icu.util.ULocale;
22 import java.util.ArrayList;
23 import java.util.Arrays;
24 import java.util.Comparator;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Locale;
29 import java.util.Map;
30 import java.util.Set;
31 import java.util.TreeSet;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34 import org.unicode.cldr.test.CheckExemplars.ExemplarType;
35 import org.unicode.cldr.util.AnnotationUtil;
36 import org.unicode.cldr.util.Builder;
37 import org.unicode.cldr.util.CLDRConfig;
38 import org.unicode.cldr.util.CLDRFile;
39 import org.unicode.cldr.util.CLDRLocale;
40 import org.unicode.cldr.util.CldrUtility;
41 import org.unicode.cldr.util.ComparatorUtilities;
42 import org.unicode.cldr.util.DateTimeCanonicalizer;
43 import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType;
44 import org.unicode.cldr.util.Emoji;
45 import org.unicode.cldr.util.LocaleNames;
46 import org.unicode.cldr.util.PatternCache;
47 import org.unicode.cldr.util.SimpleUnicodeSetFormatter;
48 import org.unicode.cldr.util.SupplementalDataInfo;
49 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
50 import org.unicode.cldr.util.VoteResolver;
51 import org.unicode.cldr.util.XMLSource;
52 import org.unicode.cldr.util.XPathParts;
53 
54 /**
55  * Class for processing the input and output of CLDR data for use in the Survey Tool and other
56  * tools.
57  */
58 public class DisplayAndInputProcessor {
59 
60     /** Special PersonName paths that allow empty string, public for testing */
61     public static final String NOL_START_PATH = "//ldml/personNames/nameOrderLocales";
62 
63     public static final String FSR_START_PATH = "//ldml/personNames/foreignSpaceReplacement";
64     public static final String NSR_START_PATH = "//ldml/personNames/nativeSpaceReplacement";
65 
66     public static final String EMPTY_ELEMENT_VALUE = "❮EMPTY❯";
67 
68     private static final boolean FIX_YEARS = true;
69 
70     public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false);
71 
72     public static final UnicodeSet RTL =
73             new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]").freeze();
74 
75     public static final Pattern NUMBER_SEPARATOR_PATTERN =
76             Pattern.compile("//ldml/numbers/symbols.*/(decimal|group)");
77 
78     private static final Pattern APOSTROPHE_SKIP_PATHS =
79             PatternCache.get(
80                     "//ldml/("
81                             + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|"
82                             + "characters/.*|"
83                             + "delimiters/.*|"
84                             + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|"
85                             + "units/.+/unitPattern.*|"
86                             + "units/.+/durationUnitPattern.*|"
87                             + "numbers/symbols.*|"
88                             + "numbers/miscPatterns.*|"
89                             + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)");
90     private static final Pattern INTERVAL_FORMAT_PATHS =
91             PatternCache.get("//ldml/dates/.+/intervalFormat(Item.*|Fallback)");
92     private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");
93 
94     // Pattern to match against paths that might have time formats with h or K (12-hour cycles)
95     private static final Pattern HOUR_FORMAT_XPATHS =
96             PatternCache.get(
97                     "//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
98                             + "timeFormats/timeFormatLength\\[@type=\"[^\"]*\"]/timeFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
99                             + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*|"
100                             + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*)");
101 
102     private static final Pattern AMPM_SPACE_BEFORE =
103             PatternCache.get("([Khms])([ \\u00A0\\u202F]+)(a+)"); // time, space, a+
104     private static final Pattern AMPM_SPACE_AFTER =
105             PatternCache.get("(a+)([ \\u00A0\\u202F]+)([Kh])"); // a+, space, hour
106 
107     // Pattern to match against paths that might have date formats with y
108     private static final Pattern YEAR_FORMAT_XPATHS =
109             PatternCache.get(
110                     "//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
111                             + "dateFormats/dateFormatLength\\[@type=\"[^\"]*\"]/dateFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
112                             + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*|"
113                             + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*)");
114 
115     // Cyrillic year markers are or begin with (in various languages) \u0430 \u0433 \u0435 \u0436
116     // \u043E \u0440 \u0441
117     private static final Pattern YEAR_SPACE_YEARMARKER =
118             PatternCache.get("y[ \\u00A0]+('?[агежорс])"); // y, space, Cyrillic year marker start
119 
120     public static final Pattern UNIT_NARROW_XPATHS =
121             PatternCache.get(
122                     "//ldml/units/unitLength\\[@type=\"narrow\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");
123 
124     public static final Pattern UNIT_SHORT_XPATHS =
125             PatternCache.get(
126                     "//ldml/units/unitLength\\[@type=\"short\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");
127 
128     private static final Pattern PLACEHOLDER_SPACE_AFTER =
129             PatternCache.get("\\}[ \\u00A0\\u202F]+");
130     private static final Pattern PLACEHOLDER_SPACE_BEFORE =
131             PatternCache.get("[ \\u00A0\\u202F]+\\{");
132     private static final Pattern INTERVAL_FALLBACK_RANGE = PatternCache.get("\\} [\\u2013-] \\{");
133 
134     /** string of whitespace not including NBSP, i.e. whatever \s matches (by default [ \t\n\x0B\f\r]) */
135     private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+"); //
136 
137     /** string of whitespace, possibly including NBSP (but not NNBSP), i.e., [\s\u00A0]+ */
138     private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE =
139             PatternCache.get("[\\s\\u00A0]+");
140     // Reverted 2022-12-08 from:
141     // private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE =
142     // PatternCache.get("[\\s\\u00A0\\u202F]+");
143 
144     /** one or more NBSP followed by one or more regular spaces (NNBSP is not matched) */
145     private static final Pattern NBSP_PLUS_SPACE_TO_NORMALIZE =
146             PatternCache.get("\\u00A0+\\u0020+");
147     // Reverted 2022-12-08 from:
148     // private static final Pattern NBSP_PLUS_SPACE_TO_NORMALIZE =
149     // PatternCache.get("[\\u00A0\\u202F]+\\u0020+");
150 
151     /** one or more regular spaces followed by one or more NBSP (NNBSP is not matched) */
152     private static final Pattern SPACE_PLUS_NBSP_TO_NORMALIZE =
153             PatternCache.get("\\u0020+\\u00A0+");
154     // Reverted 2022-12-08 from:
155     // private static final Pattern SPACE_PLUS_NBSP_TO_NORMALIZE =
156     // PatternCache.get("\\u0020+[\\u00A0\\u202F]+");
157 
158     // NNBSP 202F among other horizontal spaces (includes 0020, 00A0, 2009, 202F, etc.)
159     private static final Pattern NNBSP_AMONG_OTHER_SPACES =
160             PatternCache.get("[\\h&&[^\\u202F]]+\\u202F\\h*|\\u202F\\h+");
161     // NBSP 00A0 among other horizontal spaces
162     private static final Pattern NBSP_AMONG_OTHER_SPACES =
163             PatternCache.get("[\\h&&[^\\u00A0]]+\\u00A0\\h*|\\u00A0\\h+");
164     // THIN SPACE 2009 among other horizontal spaces
165     private static final Pattern THIN_SPACE_AMONG_OTHER_SPACES =
166             PatternCache.get("[\\h&&[^\\u2009]]+\\u2009\\h*|\\u2009\\h+");
167 
168     private static final Pattern INITIAL_NBSP = PatternCache.get("^[\\u00A0\\u202F]+");
169     private static final Pattern FINAL_NBSP = PatternCache.get("[\\u00A0\\u202F]+$");
170 
171     private static final Pattern MULTIPLE_NBSP = PatternCache.get("\\u00A0\\u00A0+");
172     // Reverted 2022-12-08 from:
173     // private static final Pattern MULTIPLE_NBSP =
174     // PatternCache.get("[\\u00A0\\u202F][\\u00A0\\u202F]+");
175 
176     // The following includes (among others) \u0009, \u0020, \u00A0, \u2007, \u2009, \u202F, \u3000
177     private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
178 
179     private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
180     private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro");
181     private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca");
182     private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo");
183     private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg");
184     private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he");
185     private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my");
186     private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky");
187     private static final CLDRLocale URDU = CLDRLocale.getInstance("ur");
188     private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps");
189     private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa");
190     private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH");
191     private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw");
192     private static final CLDRLocale FF_ADLAM = CLDRLocale.getInstance("ff_Adlm");
193     private static final CLDRLocale KASHMIRI = CLDRLocale.getInstance("ks");
194     public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE =
195             new HashSet<>(
196                     Arrays.asList(
197                             "br", "bss", "cad", "cic", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo",
198                             "mic", "moh", "mus", "nnh", "qu", "quc", "uk", "uz", "uz_Latn"));
199 
200     // Ş ş Ţ ţ  =>  Ș ș Ț ț
201     private static final char[][] ROMANIAN_CONVERSIONS = {
202         {'\u015E', '\u0218'}, {'\u015F', '\u0219'}, {'\u0162', '\u021A'}, {'\u0163', '\u021B'}
203     };
204 
205     private static final char[][] CATALAN_CONVERSIONS = {
206         {'\u013F', '\u004C', '\u00B7'}, // Ŀ -> L·
207         {'\u0140', '\u006C', '\u00B7'}
208     }; // ŀ -> l·
209 
210     private static final char[][] NGOMBA_CONVERSIONS = {
211         {'\u0251', '\u0061'}, {'\u0261', '\u0067'}, //  ɑ -> a , ɡ -> g , See ticket #5691
212         {'\u2019', '\uA78C'}, {'\u02BC', '\uA78C'}
213     }; //  Saltillo, see ticket #6805
214 
215     private static final char[][] KWASIO_CONVERSIONS = {
216         {'\u0306', '\u030C'}, // See ticket #6571, use caron instead of breve
217         {'\u0103', '\u01CE'},
218         {'\u0102', '\u01CD'}, // a-breve -> a-caron
219         {'\u0115', '\u011B'},
220         {'\u011A', '\u01CD'}, // e-breve -> e-caron
221         {'\u012D', '\u01D0'},
222         {'\u012C', '\u01CF'}, // i-breve -> i-caron
223         {'\u014F', '\u01D2'},
224         {'\u014E', '\u01D1'}, // o-breve -> o-caron
225         {'\u016D', '\u01D4'},
226         {'\u016C', '\u01D3'} // u-breve -> u-caron
227     };
228 
229     private static final char[][] HEBREW_CONVERSIONS = {
230         {'\'', '\u05F3'}, {'"', '\u05F4'}
231     }; //  ' -> geresh  " -> gershayim
232 
233     private static final char[][] KYRGYZ_CONVERSIONS = {{'ӊ', 'ң'}, {'Ӊ', 'Ң'}}; //  right modifier
234 
235     private static final char[][] URDU_PLUS_CONVERSIONS = {{'\u0643', '\u06A9'}}; //  wrong char
236 
237     private static final char[][] KASHMIRI_CONVERSIONS = {
238         {'ۍ', 'ؠ'}
239     }; //  wrong char (see CLDR-16595)
240 
241     private static final ZawgyiDetector detector = new ZawgyiDetector();
242     private static final Transliterator zawgyiUnicodeTransliterator =
243             Transliterator.getInstance("Zawgyi-my");
244 
245     private SimpleUnicodeSetFormatter pp = new SimpleUnicodeSetFormatter(); // default collator
246     private UnicodeSetPrettyPrinter rawFormatter = new UnicodeSetPrettyPrinter(); // default
247 
248     private final CLDRLocale locale;
249     private String scriptCode; // actual or default script code (not null after init)
250     private boolean isPosix;
251 
252     private CLDRFile cldrFileForBailey = null;
253 
254     /**
255      * Constructor, taking cldrFile.
256      *
257      * @param cldrFileToCheck
258      */
DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator)259     public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) {
260         init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator);
261     }
262 
DisplayAndInputProcessor(CLDRFile cldrFileToCheck)263     public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) {
264         init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true);
265     }
266 
init(CLDRLocale locale, boolean needsCollator)267     void init(CLDRLocale locale, boolean needsCollator) {
268         isPosix = locale.toString().contains("POSIX");
269         if (needsCollator) {
270             Collator col =
271                     ComparatorUtilities.getCldrCollator(locale.toString(), Collator.IDENTICAL);
272             Collator spaceCol =
273                     ComparatorUtilities.getCldrCollator(locale.toString(), Collator.PRIMARY);
274             pp = new SimpleUnicodeSetFormatter((Comparator) col);
275             rawFormatter = UnicodeSetPrettyPrinter.from((Comparator) col, (Comparator) spaceCol);
276         } else {
277             pp = new SimpleUnicodeSetFormatter(); // default collator
278             rawFormatter = new UnicodeSetPrettyPrinter(); // default
279         }
280         String script = locale.getScript();
281         if (script == null || script.length() < 4) {
282             SupplementalDataInfo sdi = CLDRConfig.getInstance().getSupplementalDataInfo();
283             script = sdi.getDefaultScript(locale.getBaseName());
284             if (script == null || script.length() < 4 || script.equals("Zzzz")) {
285                 script = sdi.getDefaultScript(locale.getLanguage());
286             }
287             if (script == null || script.length() < 4) {
288                 script = "Zzzz";
289             }
290         }
291         scriptCode = script;
292     }
293 
getPrettyPrinter()294     public SimpleUnicodeSetFormatter getPrettyPrinter() {
295         return pp;
296     }
297 
298     /**
299      * Constructor, taking ULocale and boolean.
300      *
301      * @param locale the ULocale
302      * @param needsCollator true or false
303      *     <p>Called by getProcessor, with locale = SurveyMain.TRANS_HINT_LOCALE
304      */
DisplayAndInputProcessor(ULocale locale, boolean needsCollator)305     public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) {
306         init(this.locale = CLDRLocale.getInstance(locale), needsCollator);
307     }
308 
309     /**
310      * Constructor, taking ULocale.
311      *
312      * @param locale the ULocale
313      */
DisplayAndInputProcessor(ULocale locale)314     public DisplayAndInputProcessor(ULocale locale) {
315         init(this.locale = CLDRLocale.getInstance(locale), true /* needsCollator */);
316     }
317 
318     /**
319      * Constructor, taking CLDRLocale and boolean.
320      *
321      * @param locale the CLDRLocale
322      * @param needsCollator true or false
323      */
DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator)324     public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) {
325         init(this.locale = locale, needsCollator);
326     }
327 
/**
 * Creates a processor for the given CLDRLocale, with collators enabled.
 *
 * @param locale the CLDRLocale
 */
public DisplayAndInputProcessor(CLDRLocale locale) {
    this.locale = locale;
    init(locale, true /* needsCollator */);
}
336 
337     /**
338      * Process the value for display. The result is a string for display in the Survey tool or
339      * similar program.
340      *
341      * @param path
342      * @param value
343      * @return
344      */
processForDisplay(String path, String value)345     public synchronized String processForDisplay(String path, String value) {
346         if (value == null) {
347             return null;
348         }
349         if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
350             return value;
351         }
352         value = Normalizer.compose(value, false); // Always normalize all text to NFC.
353         if (hasUnicodeSetValue(path)) {
354             return displayUnicodeSet(value);
355         } else if (path.contains("stopword")) {
356             return value.trim().isEmpty() ? "NONE" : value;
357         } else {
358             NumericType numericType = NumericType.getNumericType(path);
359             if (numericType != NumericType.NOT_NUMERIC) {
360                 // Canonicalize existing values that aren't canonicalized yet.
361                 // New values will be canonicalized on input using processInput().
362                 try {
363                     value = getCanonicalPattern(value, numericType, isPosix);
364                 } catch (IllegalArgumentException e) {
365                     if (DEBUG_DAIP) System.err.println("Illegal pattern: " + value);
366                 }
367                 if (numericType != NumericType.CURRENCY
368                         && numericType != NumericType.CURRENCY_ABBREVIATED) {
369                     value = value.replace("'", "");
370                 }
371             }
372         }
373         // Fix up any apostrophes in number symbols
374         if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
375             value = value.replace('\'', '\u2019');
376         }
377         // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
378         if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
379             value = normalizeApostrophes(value);
380         }
381         // Fix up hyphens, replacing with N-dash as appropriate
382         if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
383             value =
384                     normalizeIntervalHyphensAndSpaces(
385                             value); // This may also adjust spaces around en dash
386         } else {
387             value = normalizeHyphens(value);
388         }
389         // Fix up possibly empty field
390         if (value.isEmpty()
391                 && (path.startsWith(FSR_START_PATH)
392                         || path.startsWith(NSR_START_PATH)
393                         || path.startsWith(NOL_START_PATH))) {
394             value = EMPTY_ELEMENT_VALUE;
395         }
396         return value;
397     }
398 
hasUnicodeSetValue(String path)399     private boolean hasUnicodeSetValue(String path) {
400         return path.startsWith("//ldml/characters/exemplarCharacters")
401                 || path.startsWith("//ldml/characters/parseLenients");
402     }
403 
404     static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS);
405 
406     private static final String BAR_VL = "\\|"; // U+007C VERTICAL LINE (pipe, bar) literal
407     private static final String BAR_EL = "\\s+l\\s+"; // U+006C LATIN SMALL LETTER L with space
408     private static final String BAR_DANDA = "।"; // U+0964 DEVANAGARI DANDA
409     private static final String BAR_REGEX = "(" + BAR_VL + "|" + BAR_EL + "|" + BAR_DANDA + ")";
410     public static final Splitter SPLIT_BAR =
411             Splitter.on(Pattern.compile(BAR_REGEX)).trimResults().omitEmptyStrings();
412     static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings();
413     static final Joiner JOIN_BAR = Joiner.on(" | ");
414     static final Joiner JOIN_SPACE = Joiner.on(' ');
415 
416     /**
417      * Process the value for input. The result is a cleaned-up value. For example, an exemplar set
418      * is modified to be in the normal format, and any missing [ ] are added (a common omission on
419      * entry). If there are any failures then the original value is returned, so that the proper
420      * error message can be given.
421      *
422      * @param path
423      * @param value
424      * @param internalException to be filled in if RuntimeException occurs
425      * @return the possibly modified value
426      */
processInput( String path, String value, Exception[] internalException)427     public synchronized String processInput(
428             String path, String value, Exception[] internalException) {
429         // skip processing for inheritance marker
430         if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
431             return value;
432         }
433         final String original = value;
434         value = stripProblematicControlCharacters(value);
435         value = Normalizer.compose(value, false); // Always normalize all input to NFC.
436         value = value.replace('\u00B5', '\u03BC'); // use the right Greek mu character
437         if (internalException != null) {
438             internalException[0] = null;
439         }
440         // for root annotations
441         if (CLDRLocale.ROOT.equals(locale) && path.contains("/annotations")) {
442             return value;
443         }
444         try {
445             value = processInputMore(path, value);
446         } catch (RuntimeException e) {
447             if (internalException != null) {
448                 internalException[0] = e;
449             }
450             return original;
451         }
452         return value;
453     }
454 
processInputMore(String path, String value)455     private String processInputMore(String path, String value) {
456         final boolean isUnicodeSet = hasUnicodeSetValue(path);
457         if (isUnicodeSet) {
458             return inputUnicodeSet(path, value);
459         }
460 
461         value = processLocaleSpecificInput(path, value, isUnicodeSet);
462 
463         if (UNICODE_WHITESPACE.containsSome(value)) {
464             value = normalizeWhitespace(path, value);
465         }
466 
467         // remove the empty value (mostly relevant for person names,
468         // but prevents it showing up elsewhere by mistake
469         value = value.replace(EMPTY_ELEMENT_VALUE, "");
470 
471         // all of our values should not have leading or trailing spaces, except insertBetween,
472         // foreignSpaceReplacement, and anything with built-in attribute xml:space="preserve"
473         if (!path.contains("/insertBetween")
474                 && !path.contains("/foreignSpaceReplacement")
475                 && !path.contains("/nativeSpaceReplacement")
476                 && !path.contains("[@xml:space=\"preserve\"]")
477                 && !isUnicodeSet) {
478             value = value.trim();
479         }
480 
481         // fix grouping separator if space
482         if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) {
483             if (value.isEmpty()) {
484                 value = "\u00A0";
485             }
486             value = value.replace(' ', '\u00A0');
487         }
488 
489         // fix date patterns
490         DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path);
491         if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) {
492             try {
493                 value = dtc.getCanonicalDatePattern(path, value, datetimePatternType);
494             } catch (IllegalArgumentException ex) {
495                 return value;
496             }
497         }
498 
499         if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) {
500             value = normalizeCurrencyDisplayName(value);
501         }
502         NumericType numericType = NumericType.getNumericType(path);
503         if (numericType != NumericType.NOT_NUMERIC) {
504             if (numericType == NumericType.CURRENCY) {
505                 value = value.replaceAll(" ", "\u00A0");
506                 // NOTE: the following "if ... NumericType.CURRENCY_ABBREVIATED" was false here,
507                 // since we know it is NumericType.CURRENCY; so now the code is commented out; if
508                 // anyone
509                 // understands what the intention was, maybe the condition should be restored
510                 // somehow,
511                 // such as with "else if"
512                 // if (numericType == NumericType.CURRENCY_ABBREVIATED) {
513                 //    value = value.replaceAll("0\\.0+", "0");
514                 // }
515             } else {
516                 value =
517                         value.replaceAll("([%\u00A4]) ", "$1\u00A0")
518                                 .replaceAll(" ([%\u00A4])", "\u00A0$1");
519                 value = replace(NON_DECIMAL_PERIOD, value, "'.'");
520                 if (numericType == NumericType.DECIMAL_ABBREVIATED) {
521                     value = value.replaceAll("0\\.0+", "0");
522                 }
523             }
524             value = getCanonicalPattern(value, numericType, isPosix);
525         }
526 
527         // fix [,]
528         if (path.startsWith("//ldml/localeDisplayNames/languages/language")
529                 || path.startsWith("//ldml/localeDisplayNames/scripts/script")
530                 || path.startsWith("//ldml/localeDisplayNames/territories/territory")
531                 || path.startsWith("//ldml/localeDisplayNames/variants/variant")
532                 || path.startsWith("//ldml/localeDisplayNames/keys/key")
533                 || path.startsWith("//ldml/localeDisplayNames/types/type")) {
534             value = value.replace('[', '(').replace(']', ')').replace('[', '(').replace(']', ')');
535         }
536 
537         // Normalize two single quotes for the inches symbol.
538         if (path.contains("/units")) {
539             value = value.replace("''", "″");
540         }
541 
542         // check specific cases
543         //        if (isUnicodeSet) {
544         //            value = inputUnicodeSet(path, value);
545         //        } else
546         if (path.contains("stopword")) {
547             if (value.equals("NONE")) {
548                 value = "";
549             }
550         }
551 
552         // Normalize ellipsis data.
553         if (path.startsWith("//ldml/characters/ellipsis")) {
554             value = value.replace("...", "…");
555         }
556 
557         if (path.startsWith(NOL_START_PATH)) {
558             value = normalizeNameOrderLocales(value);
559         }
560 
561         // Replace Arabic presentation forms with their nominal counterparts
562         value = replaceArabicPresentationForms(value);
563 
564         // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
565         if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
566             value = normalizeApostrophes(value);
567         }
568         // Fix up any apostrophes in number symbols
569         if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
570             value = value.replace('\'', '\u2019');
571         }
572         // Fix up hyphens, replacing with N-dash as appropriate
573         if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
574             value =
575                     normalizeIntervalHyphensAndSpaces(
576                             value); // This may also adjust spaces around en dash
577         } else if (!isUnicodeSet) {
578             value = normalizeHyphens(value);
579         }
580         value = processAnnotations(path, value);
581         value = normalizeZeroWidthSpace(value);
582         if (VoteResolver.DROP_HARD_INHERITANCE) {
583             value = replaceBaileyWithInheritanceMarker(path, value);
584         }
585         return value;
586     }
587 
processLocaleSpecificInput(String path, String value, boolean isUnicodeSet)588     private String processLocaleSpecificInput(String path, String value, boolean isUnicodeSet) {
589         if (locale.childOf(MALAYALAM)) {
590             String newvalue = normalizeMalayalam(value);
591             if (DEBUG_DAIP)
592                 System.out.println(
593                         "DAIP: Normalized Malayalam '" + value + "' to '" + newvalue + "'");
594             value = newvalue;
595         } else if (locale.childOf(ROMANIAN) && !isUnicodeSet) {
596             value = standardizeRomanian(value);
597         } else if (locale.childOf(CATALAN) && !isUnicodeSet) {
598             value = standardizeCatalan(value);
599         } else if (locale.childOf(NGOMBA) && !isUnicodeSet) {
600             value = standardizeNgomba(value);
601         } else if (locale.childOf(KWASIO) && !isUnicodeSet) {
602             value = standardizeKwasio(value);
603         } else if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
604             value = replaceChars(path, value, HEBREW_CONVERSIONS, false);
605         } else if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND))
606                 && !isUnicodeSet) {
607             value = standardizeSwissGerman(value);
608         } else if (locale.childOf(MYANMAR) && !isUnicodeSet) {
609             value = standardizeMyanmar(value);
610         } else if (locale.childOf(KYRGYZ)) {
611             value = replaceChars(path, value, KYRGYZ_CONVERSIONS, false);
612         } else if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) {
613             value = replaceChars(path, value, URDU_PLUS_CONVERSIONS, true);
614         } else if (locale.childOf(FF_ADLAM) && !isUnicodeSet) {
615             value = fixAdlamNasalization(value);
616         } else if (locale.childOf(KASHMIRI)) {
617             value = replaceChars(path, value, KASHMIRI_CONVERSIONS, false);
618         }
619         return value;
620     }
621 
processAnnotations(String path, String value)622     private String processAnnotations(String path, String value) {
623         if (AnnotationUtil.pathIsAnnotation(path)) {
624             if (path.contains(Emoji.TYPE_TTS)) {
625                 // The row has something like "�� -name" in the first column. Cf. namePath,
626                 // getNamePaths.
627                 // Normally the value is like "zebra" or "unicorn face", without "|".
628                 // If the user enters a value with "|",  discard anything after "|"; e.g., change "a
629                 // | b | c" to "a".
630                 value = SPLIT_BAR.split(value).iterator().next();
631             } else {
632                 // The row has something like "�� –keywords" in the first column. Cf. keywordPath,
633                 // getKeywordPaths.
634                 // Normally the value is like "stripe | zebra", with "|".
635                 value = annotationsForDisplay(value);
636             }
637         }
638         return value;
639     }
640 
normalizeNameOrderLocales(String value)641     private String normalizeNameOrderLocales(String value) {
642         value = value.replace(EMPTY_ELEMENT_VALUE, "");
643         TreeSet<String> result = new TreeSet<>(SPLIT_SPACE.splitToList(value));
644         result.remove(LocaleNames.ZXX);
645         if (result.remove(LocaleNames.UND)) { // put und at the front
646             if (result.isEmpty()) {
647                 return LocaleNames.UND;
648             } else {
649                 return LocaleNames.UND + " " + JOIN_SPACE.join(result);
650             }
651         }
652         return JOIN_SPACE.join(result);
653     }
654 
655     /**
656      * Strip out all code points less than U+0020 except for U+0009 tab, U+000A line feed, and
657      * U+000D carriage return.
658      *
659      * @param s the string
660      * @return the resulting string
661      */
stripProblematicControlCharacters(String s)662     private String stripProblematicControlCharacters(String s) {
663         if (s == null || s.isEmpty()) {
664             return s;
665         }
666         return s.codePoints()
667                 .filter(c -> (c >= 0x20 || c == 9 || c == 0xA || c == 0xD))
668                 .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
669                 .toString();
670     }
671 
672     private static final boolean REMOVE_COVERED_KEYWORDS = true;
673 
674     /**
675      * Produce a modification of the given annotation by sorting its components and filtering
676      * covered keywords.
677      *
678      * <p>Examples: Given "b | a", return "a | b". Given "bear | panda | panda bear", return "bear |
679      * panda".
680      *
681      * @param value the string
682      * @return the possibly modified string
683      */
annotationsForDisplay(String value)684     private static String annotationsForDisplay(String value) {
685         TreeSet<String> sorted = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
686         sorted.addAll(SPLIT_BAR.splitToList(value));
687         if (REMOVE_COVERED_KEYWORDS) {
688             filterCoveredKeywords(sorted);
689         }
690         value = JOIN_BAR.join(sorted);
691         return value;
692     }
693 
694     /**
695      * Filter from the given set some keywords that include spaces, if they duplicate, or are
696      * "covered by", other keywords in the set.
697      *
698      * <p>For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda |
699      * panda bear"), then remove "panda bear", treating it as "covered" since the set already
700      * includes "panda" and "bear". Also, for example, if the set is {"bear", "panda", "PANDA
701      * BEAR"}, then remove "PANDA BEAR" even though the casing differs.
702      *
703      * <p>Since casing is complex in many languages/scripts, this method does not attempt to
704      * recognize all occurrences of case-insensitive matching. Instead, it first checks for
705      * case-sensitive (exact) matching, then it checks for case-insensitive (loose) matching
706      * according to Locale.ROOT. The intended effect is only to remove an item like "PANDA BEAR" if
707      * both "panda" and "bear" are already present as individual items. The intended effect is never
708      * to modify the casing of any item that is already present.
709      *
710      * @param sorted the set from which items may be removed
711      */
filterCoveredKeywords(TreeSet<String> sorted)712     public static void filterCoveredKeywords(TreeSet<String> sorted) {
713         // for now, just do single items
714         HashSet<String> toRemove = new HashSet<>();
715 
716         TreeSet<String> sortedLower = new TreeSet<>();
717         for (String item : sorted) {
718             sortedLower.add(item.toLowerCase(Locale.ROOT));
719         }
720         for (String item : sorted) {
721             List<String> list = SPLIT_SPACE.splitToList(item);
722             if (list.size() < 2) {
723                 continue;
724             }
725             if (sorted.containsAll(list)) {
726                 toRemove.add(item);
727             } else {
728                 List<String> listLower = new ArrayList<>();
729                 for (String s : list) {
730                     listLower.add(s.toLowerCase(Locale.ROOT));
731                 }
732                 if (sortedLower.containsAll(listLower)) {
733                     toRemove.add(item);
734                 }
735             }
736         }
737         sorted.removeAll(toRemove);
738     }
739 
740     /**
741      * Given a sorted list like "BEAR | Bear | PANDA | Panda | panda",filter out any items that
742      * duplicate other items aside from case, leaving only, for example, "BEAR | PANDA"
743      *
744      * @param sorted the set from which items may be removed
745      */
filterKeywordsDifferingOnlyInCase(TreeSet<String> sorted)746     public static void filterKeywordsDifferingOnlyInCase(TreeSet<String> sorted) {
747         TreeMultimap<String, String> mapFromLower = TreeMultimap.create();
748         for (String item : sorted) {
749             mapFromLower.put(item.toLowerCase(), item);
750         }
751         TreeSet<String> toRetain = new TreeSet<>();
752         for (String lower : mapFromLower.keySet()) {
753             Set<String> variants = mapFromLower.get(lower);
754             for (String var : variants) {
755                 toRetain.add(var);
756                 break;
757             }
758         }
759         sorted.retainAll(toRetain);
760     }
761 
displayUnicodeSet(String value)762     private String displayUnicodeSet(String value) {
763         return pp.format(
764                 new UnicodeSet(value)); // will throw exception if bad format, eg missing [...]
765     }
766 
inputUnicodeSet(String path, String value)767     private String inputUnicodeSet(String path, String value) {
768         UnicodeSet exemplar = null;
769         // hack, in case the input is called twice
770         value = value.trim();
771         if (value.startsWith("[") && value.endsWith("]")) {
772             try {
773                 exemplar = new UnicodeSet(value);
774             } catch (Exception e2) {
775                 // fall through
776             }
777         }
778         if (exemplar == null) {
779             try {
780                 exemplar = pp.parse(value);
781             } catch (Exception e) {
782                 // can't parse at all
783                 return value; // we can't throw an exception because clients won't expect it.
784             }
785         }
786         XPathParts parts = XPathParts.getFrozenInstance(path);
787         //        if (parts.getElement(2).equals("parseLenients")) {
788         //            return exemplar.toPattern(false);
789         //        }
790         final String type = parts.getAttributeValue(-1, "type");
791         ExemplarType exemplarType =
792                 !path.contains("exemplarCharacters")
793                         ? null
794                         : type == null ? ExemplarType.main : ExemplarType.valueOf(type);
795         value = getCleanedUnicodeSet(exemplar, exemplarType);
796         return value;
797     }
798 
normalizeCurrencyDisplayName(String value)799     private String normalizeCurrencyDisplayName(String value) {
800         StringBuilder result = new StringBuilder();
801         boolean inParentheses = false;
802         for (int i = 0; i < value.length(); i++) {
803             char c = value.charAt(i);
804             if (c == '(') {
805                 inParentheses = true;
806             } else if (c == ')') {
807                 inParentheses = false;
808             }
809             if (inParentheses && c == '-' && Character.isDigit(value.charAt(i - 1))) {
810                 c = 0x2013; /* Replace hyphen-minus with dash for date ranges */
811             }
812             result.append(c);
813         }
814         return result.toString();
815     }
816 
normalizeApostrophes(String value)817     private String normalizeApostrophes(String value) {
818         // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set
819         // in it to see.
820         // But since we don't, we just maintain the list internally and use it.
821         if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) {
822             return value.replace('\'', '\u02bc');
823         } else {
824             char prev = 0;
825             StringBuilder builder = new StringBuilder();
826             for (char c : value.toCharArray()) {
827                 if (c == '\'') {
828                     if (Character.isLetter(prev)) {
829                         builder.append('\u2019');
830                     } else {
831                         builder.append('\u2018');
832                     }
833                 } else {
834                     builder.append(c);
835                 }
836                 prev = c;
837             }
838             return builder.toString();
839         }
840     }
841 
normalizeIntervalHyphensAndSpaces(String value)842     private String normalizeIntervalHyphensAndSpaces(String value) {
843         if (value.contains("{0}")) {
844             // intervalFormatFallback pattern, not handled by DateTimePatternGenerator.FormatParser
845             if (scriptCode.equals("Latn")) {
846                 value = INTERVAL_FALLBACK_RANGE.matcher(value).replaceAll("}\u2009\u2013\u2009{");
847             }
848             return value;
849         }
850         DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser();
851         fp.set(
852                 DateIntervalInfo.genPatternInfo(value, false)
853                         .getFirstPart()); // first format & separator including spaces
854         List<Object> items = fp.getItems();
855         Object last = items.get(items.size() - 1);
856         if (last instanceof String) {
857             String separator =
858                     last.toString(); // separator including spaces, and possibly preceding
859             // literal text (. or quoted)
860             String replacement = separator;
861             if (scriptCode.equals("Latn")
862                     && (separator.endsWith(" - ") || separator.endsWith(" \u2013 "))) {
863                 replacement =
864                         separator.substring(0, separator.length() - 3)
865                                 + "\u2009\u2013\u2009"; // Per CLDR-14032,16308
866             } else if (separator.contains("-")) {
867                 replacement = separator.replace("-", "\u2013");
868             }
869             if (!replacement.equals(separator)) {
870                 StringBuilder sb = new StringBuilder();
871                 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
872                 if (sb.lastIndexOf(separator) >= 0) {
873                     sb.delete(sb.lastIndexOf(separator), sb.length());
874                     sb.append(replacement);
875                     sb.append(
876                             DateIntervalInfo.genPatternInfo(value, false)
877                                     .getSecondPart()); // second format only
878                     return sb.toString();
879                 }
880             }
881         }
882         return value;
883     }
884 
normalizeHyphens(String value)885     private String normalizeHyphens(String value) {
886         int hyphenLocation = value.indexOf("-");
887         if (hyphenLocation > 0
888                 && Character.isDigit(value.charAt(hyphenLocation - 1))
889                 && hyphenLocation < value.length() - 1
890                 && Character.isDigit(value.charAt(hyphenLocation + 1))) {
891             return value.substring(0, hyphenLocation)
892                     + "\u2013"
893                     + value.substring(hyphenLocation + 1);
894         }
895         return value;
896     }
897 
standardizeRomanian(String value)898     private String standardizeRomanian(String value) {
899         StringBuilder builder = new StringBuilder();
900         for (char c : value.toCharArray()) {
901             for (char[] pair : ROMANIAN_CONVERSIONS) {
902                 if (c == pair[0]) {
903                     c = pair[1];
904                     break;
905                 }
906             }
907             builder.append(c);
908         }
909         return builder.toString();
910     }
911 
standardizeKwasio(String value)912     private String standardizeKwasio(String value) {
913         StringBuilder builder = new StringBuilder();
914         for (char c : value.toCharArray()) {
915             for (char[] pair : KWASIO_CONVERSIONS) {
916                 if (c == pair[0]) {
917                     c = pair[1];
918                     break;
919                 }
920             }
921             builder.append(c);
922         }
923         return builder.toString();
924     }
925 
926     // Use the myanmar-tools detector.
standardizeMyanmar(String value)927     private String standardizeMyanmar(String value) {
928         if (detector.getZawgyiProbability(value) > 0.90) {
929             return zawgyiUnicodeTransliterator.transform(value);
930         }
931         return value;
932     }
933 
standardizeNgomba(String value)934     private String standardizeNgomba(String value) {
935         StringBuilder builder = new StringBuilder();
936         char[] charArray = value.toCharArray();
937         for (int i = 0; i < charArray.length; i++) {
938             char c = charArray[i];
939             boolean convertedSaltillo = false;
940             for (char[] pair : NGOMBA_CONVERSIONS) {
941                 if (c == pair[0]) {
942                     c = pair[1];
943                     if (c == '\uA78C') {
944                         convertedSaltillo = true;
945                     }
946                     break;
947                 }
948             }
949             if (convertedSaltillo
950                     && ((i > 0
951                                     && i < charArray.length - 1
952                                     && Character.isUpperCase(charArray[i - 1])
953                                     && Character.isUpperCase(charArray[i + 1]))
954                             || (i > 1
955                                     && Character.isUpperCase(charArray[i - 1])
956                                     && Character.isUpperCase(charArray[i - 2])))) {
957                 c = '\uA78B'; // UPPER CASE SALTILLO
958             }
959             builder.append(c);
960         }
961         return builder.toString();
962     }
963 
replaceChars( String path, String value, char[][] charsToReplace, boolean skipAuxExemplars)964     private String replaceChars(
965             String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) {
966         if (skipAuxExemplars && path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) {
967             return value;
968         }
969         StringBuilder builder = new StringBuilder();
970         for (char c : value.toCharArray()) {
971             for (char[] pair : charsToReplace) {
972                 if (c == pair[0]) {
973                     c = pair[1];
974                     break;
975                 }
976             }
977             builder.append(c);
978         }
979         return builder.toString();
980     }
981 
standardizeSwissGerman(String value)982     private String standardizeSwissGerman(String value) {
983         return value.replaceAll("\u00DF", "ss");
984     }
985 
standardizeCatalan(String value)986     private String standardizeCatalan(String value) {
987         StringBuilder builder = new StringBuilder();
988         for (char c : value.toCharArray()) {
989             boolean didSubstitute = false;
990             for (char[] triple : CATALAN_CONVERSIONS) {
991                 if (c == triple[0]) {
992                     builder.append(triple[1]);
993                     builder.append(triple[2]);
994                     didSubstitute = true;
995                     break;
996                 }
997             }
998             if (!didSubstitute) {
999                 builder.append(c);
1000             }
1001         }
1002         return builder.toString();
1003     }
1004 
replace(Pattern pattern, String value, String replacement)1005     private String replace(Pattern pattern, String value, String replacement) {
1006         String value2 = pattern.matcher(value).replaceAll(replacement);
1007         if (DEBUG_DAIP && !value.equals(value2)) {
1008             System.out.println("\n" + value + " => " + value2);
1009         }
1010         return value2;
1011     }
1012 
1013     private static final Pattern UNNORMALIZED_MALAYALAM =
1014             PatternCache.get("(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D");
1015 
1016     private static final Map<Character, Character> NORMALIZING_MAP =
1017             Builder.with(new HashMap<Character, Character>())
1018                     .put('\u0D23', '\u0D7A')
1019                     .put('\u0D28', '\u0D7B')
1020                     .put('\u0D30', '\u0D7C')
1021                     .put('\u0D32', '\u0D7D')
1022                     .put('\u0D33', '\u0D7E')
1023                     .put('\u0D15', '\u0D7F')
1024                     .get();
1025 
1026     /**
1027      * Normalizes the Malayalam characters in the specified input.
1028      *
1029      * @param value the input to be normalized
1030      * @return
1031      */
normalizeMalayalam(String value)1032     private String normalizeMalayalam(String value) {
1033         // Normalize Malayalam characters.
1034         Matcher matcher = UNNORMALIZED_MALAYALAM.matcher(value);
1035         if (matcher.find()) {
1036             StringBuffer buffer = new StringBuffer();
1037             int start = 0;
1038             do {
1039                 buffer.append(value, start, matcher.start(0));
1040                 char codePoint = matcher.group(1).charAt(0);
1041                 buffer.append(NORMALIZING_MAP.get(codePoint));
1042                 start = matcher.end(0);
1043             } while (matcher.find());
1044             buffer.append(value.substring(start));
1045             value = buffer.toString();
1046         }
1047         return value;
1048     }
1049 
1050     static final Transform<String, String> fixArabicPresentation =
1051             Transliterator.getInstance(
1052                     "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc");
1053 
1054     /**
1055      * Normalizes the Arabic presentation forms characters in the specified input.
1056      *
1057      * @param value the input to be normalized
1058      * @return
1059      */
replaceArabicPresentationForms(String value)1060     private String replaceArabicPresentationForms(String value) {
1061         value = fixArabicPresentation.transform(value);
1062         return value;
1063     }
1064 
1065     static Pattern ADLAM_MISNASALIZED = PatternCache.get("([����])['’‘]([����������������])");
1066     public static String ADLAM_NASALIZATION = "��"; // U+1E94B (Unicode 12.0)
1067 
fixAdlamNasalization(String fromString)1068     public static String fixAdlamNasalization(String fromString) {
1069         return ADLAM_MISNASALIZED
1070                 .matcher(fromString)
1071                 .replaceAll("$1" + ADLAM_NASALIZATION + "$2"); // replace quote with ��
1072     }
1073 
getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType)1074     public String getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType) {
1075 
1076         if (rawFormatter == null) {
1077             throw new IllegalArgumentException("Formatter must not be null");
1078         }
1079         if (exemplar == null) {
1080             throw new IllegalArgumentException("set to be cleaned must not be null");
1081         }
1082 
1083         String value;
1084         // prettyPrinter.setCompressRanges(exemplar.size() > 300);
1085         // value = exemplar.toPattern(false);
1086         UnicodeSet toAdd = new UnicodeSet();
1087 
1088         for (UnicodeSetIterator usi = new UnicodeSetIterator(exemplar); usi.next(); ) {
1089             String string = usi.getString();
1090             if (string.equals("ß") || string.equals("İ")) {
1091                 toAdd.add(string);
1092                 continue;
1093             }
1094             switch (string) {
1095                 case "\u2011":
1096                     toAdd.add("-");
1097                     break; // nobreak hyphen
1098                 case "-":
1099                     toAdd.add("\u2011");
1100                     break; // nobreak hyphen
1101 
1102                 case " ":
1103                     toAdd.add("\u00a0");
1104                     break; // nobreak space
1105                 case "\u00a0":
1106                     toAdd.add(" ");
1107                     break; // nobreak space
1108 
1109                 case "\u202F":
1110                     toAdd.add("\u2009");
1111                     break; // nobreak narrow space
1112                 case "\u2009":
1113                     toAdd.add("\u202F");
1114                     break; // nobreak narrow space
1115             }
1116             if (exemplarType != null && exemplarType.convertUppercase) {
1117                 string = UCharacter.toLowerCase(ULocale.ENGLISH, string);
1118             }
1119             toAdd.add(string);
1120             // we allow
1121             String composed = Normalizer.compose(string, false);
1122             if (!string.equals(composed)) {
1123                 toAdd.add(composed);
1124             }
1125         }
1126 
1127         if (exemplarType != null) {
1128             toAdd.removeAll(exemplarType.toRemove);
1129         }
1130         value = rawFormatter.format(toAdd);
1131         return value;
1132     }
1133 
1134     static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults();
1135 
1136     /**
1137      * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is
1138      *     set for en_US_POSIX.
1139      */
getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX)1140     public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) {
1141         // TODO fix later to properly handle quoted ;
1142 
1143         DecimalFormat df = new DecimalFormat(inpattern);
1144         if (type == NumericType.DECIMAL_ABBREVIATED
1145                 || type == NumericType.CURRENCY_ABBREVIATED
1146                 || CldrUtility.INHERITANCE_MARKER.equals(inpattern)) {
1147             return inpattern; // TODO fix when ICU bug is fixed
1148             // df.setMaximumFractionDigits(df.getMinimumFractionDigits());
1149             // df.setMaximumIntegerDigits(Math.max(1, df.getMinimumIntegerDigits()));
1150         } else {
1151             // int decimals = type == CURRENCY_TYPE ? 2 : 1;
1152             int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount;
1153             df.setMinimumIntegerDigits(digits[0]);
1154             df.setMinimumFractionDigits(digits[1]);
1155             df.setMaximumFractionDigits(digits[2]);
1156         }
1157         String pattern = df.toPattern();
1158         List<String> parts = SEMI_SPLITTER.splitToList(pattern);
1159         String pattern2 = parts.get(0);
1160         if (parts.size() > 1) {
1161             pattern2 += ";" + parts.get(1);
1162         }
1163         if (!pattern2.equals(pattern)) {
1164             pattern = pattern2;
1165         }
1166         // int pos = pattern.indexOf(';');
1167         // if (pos < 0) return pattern + ";-" + pattern;
1168         return pattern;
1169     }
1170 
    /**
     * Store the given CLDRFile in cldrFileForBailey.
     *
     * <p>NOTE(review): judging by the method and field names, this presumably enables replacing
     * values with the inheritance marker based on that file — confirm where the field is read.
     *
     * @param cldrFile the file to store
     */
    public void enableInheritanceReplacement(CLDRFile cldrFile) {
        cldrFileForBailey = cldrFile;
    }
1174 
1175     /*
1176      * This tests what type a numeric pattern is.
1177      */
1178     public enum NumericType {
1179         CURRENCY(new int[] {1, 2, 2}, new int[] {1, 2, 2}),
1180         CURRENCY_ABBREVIATED(),
1181         DECIMAL(new int[] {1, 0, 3}, new int[] {1, 0, 6}),
1182         DECIMAL_ABBREVIATED(),
1183         PERCENT(new int[] {1, 0, 0}, new int[] {1, 0, 0}),
1184         SCIENTIFIC(new int[] {0, 0, 0}, new int[] {1, 6, 6}),
1185         NOT_NUMERIC;
1186 
1187         private static final Pattern NUMBER_PATH =
1188                 Pattern.compile(
1189                         "//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*");
1190         private int[] digitCount;
1191         private int[] posixDigitCount;
1192 
NumericType()1193         NumericType() {}
1194 
NumericType(int[] digitCount, int[] posixDigitCount)1195         NumericType(int[] digitCount, int[] posixDigitCount) {
1196             this.digitCount = digitCount;
1197             this.posixDigitCount = posixDigitCount;
1198         }
1199 
1200         /**
1201          * @return the numeric type of the xpath
1202          */
getNumericType(String xpath)1203         public static NumericType getNumericType(String xpath) {
1204             Matcher matcher = NUMBER_PATH.matcher(xpath);
1205             if (!xpath.contains("/pattern")) {
1206                 return NOT_NUMERIC;
1207             } else if (matcher.matches()) {
1208                 if (matcher.group(1).equals("currencies/currency")) {
1209                     return CURRENCY;
1210                 } else {
1211                     NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase());
1212                     if (xpath.contains("=\"1000")) {
1213                         if (type == DECIMAL) {
1214                             type = DECIMAL_ABBREVIATED;
1215                         } else if (type == CURRENCY) {
1216                             type = CURRENCY_ABBREVIATED;
1217                         } else {
1218                             throw new IllegalArgumentException("Internal Error");
1219                         }
1220                     }
1221                     return type;
1222                 }
1223             } else {
1224                 return NOT_NUMERIC;
1225             }
1226         }
1227 
getDigitCount()1228         public int[] getDigitCount() {
1229             return digitCount;
1230         }
1231 
getPosixDigitCount()1232         public int[] getPosixDigitCount() {
1233             return posixDigitCount;
1234         }
1235     }
1236 
1237     /**
1238      * Turn all whitespace sequences (including tab and newline, and NBSP for certain paths) into a
1239      * single space or a single NBSP depending on path. Also trim initial/final NBSP, unless the
1240      * value is only the one character, "\u00A0"
1241      *
1242      * @param path
1243      * @param value
1244      * @return the normalized value
1245      */
normalizeWhitespace(String path, String value)1246     private String normalizeWhitespace(String path, String value) {
1247         PathSpaceType pst = PathSpaceType.get(path);
1248         if (pst == PathSpaceType.allowSp) {
1249             value =
1250                     WHITESPACE_AND_NBSP_TO_NORMALIZE
1251                             .matcher(value)
1252                             .replaceAll(" "); // replace with regular space
1253         } else if (pst == PathSpaceType.allowNbsp) {
1254             value =
1255                     WHITESPACE_AND_NBSP_TO_NORMALIZE
1256                             .matcher(value)
1257                             .replaceAll("\u00A0"); // replace with NBSP
1258             value = trimNBSP(value);
1259         } else if (pst == PathSpaceType.allowNNbsp) {
1260             value =
1261                     WHITESPACE_AND_NBSP_TO_NORMALIZE
1262                             .matcher(value)
1263                             .replaceAll("\u202F"); // replace with NNBSP
1264             value = trimNBSP(value);
1265         } else if (pst == PathSpaceType.allowSpOrNbsp) {
1266             /*
1267              * in this case don't normalize away NBSP
1268              */
1269             value =
1270                     WHITESPACE_NO_NBSP_TO_NORMALIZE
1271                             .matcher(value)
1272                             .replaceAll(" "); // replace with regular space
1273             /*
1274              * if any NBSP and regular space are adjacent, replace with NBSP
1275              */
1276             value = NBSP_PLUS_SPACE_TO_NORMALIZE.matcher(value).replaceAll("\u00A0");
1277             value = SPACE_PLUS_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0");
1278             value = MULTIPLE_NBSP.matcher(value).replaceAll("\u00A0");
1279             value = trimNBSP(value);
1280         } else {
1281             throw new IllegalArgumentException("Unknown PathSpaceType " + pst);
1282         }
1283 
1284         // Further whitespace adjustments per CLDR-14032
1285         if ((scriptCode.equals("Latn") || scriptCode.equals("Cyrl") || scriptCode.equals("Grek"))
1286                 && HOUR_FORMAT_XPATHS.matcher(path).matches()) {
1287             String test = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1$2"); // value without a+
1288             String spaceReplace = path.contains("ascii") ? "$1\u0020$3" : "$1\u202F$3";
1289             if (value.length() - test.length() != 4) { // exclude patterns with aaaa
1290                 value = AMPM_SPACE_BEFORE.matcher(value).replaceAll(spaceReplace);
1291             }
1292             test = AMPM_SPACE_AFTER.matcher(value).replaceAll("$2$3"); // value without a+
1293             if (value.length() - test.length() != 4) { // exclude patterns with aaaa
1294                 value = AMPM_SPACE_AFTER.matcher(value).replaceAll(spaceReplace);
1295             }
1296         }
1297         if (scriptCode.equals("Cyrl") && YEAR_FORMAT_XPATHS.matcher(path).matches()) {
1298             value = YEAR_SPACE_YEARMARKER.matcher(value).replaceAll("y\u202F$1");
1299         }
1300         if (UNIT_NARROW_XPATHS.matcher(path).matches()) {
1301             value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u202F"); // Narrow NBSP
1302             value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u202F{");
1303         }
1304         if (UNIT_SHORT_XPATHS.matcher(path).matches()) {
1305             value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u00A0"); // Regular NBSP
1306             value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u00A0{");
1307         }
1308 
1309         // Finally, replace remaining space combinations with most restrictive type CLDR-17233
1310         // If we have NNBSP U+202F in combination with other spaces, keep just it
1311         value = NNBSP_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u202F");
1312         // Else if we have NBSP U+00A0 in combination with other spaces, keep just it
1313         value = NBSP_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u00A0");
1314         // Else if we have THIN SPACE U+2009 in combination with other spaces, keep just it
1315         value = THIN_SPACE_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u2009");
1316 
1317         return value;
1318     }
1319 
1320     /**
1321      * Delete any initial or final NBSP or NNBSP, unless the value is just NBSP or NNBSP
1322      *
1323      * @param value
1324      * @return the trimmed value
1325      */
trimNBSP(String value)1326     private String trimNBSP(String value) {
1327         if (!value.equals("\u00A0") && !value.equals("\u202F")) {
1328             value = INITIAL_NBSP.matcher(value).replaceAll("");
1329             value = FINAL_NBSP.matcher(value).replaceAll("");
1330         }
1331         return value;
1332     }
1333 
1334     /** Categorize xpaths according to whether they allow space, NBSP, or both */
1335     public enum PathSpaceType {
1336         allowSp,
1337         allowNbsp,
1338         allowNNbsp,
1339         allowSpOrNbsp;
1340 
get(String path)1341         public static PathSpaceType get(String path) {
1342             if (wantsRegularSpace(path)) {
1343                 return allowSp;
1344             } else if (wantsNBSP(path)) {
1345                 return allowNbsp;
1346             } else if (wantsNNBSP(path)) {
1347                 return allowNNbsp;
1348             } else {
1349                 return allowSpOrNbsp;
1350             }
1351         }
1352 
wantsRegularSpace(String path)1353         private static boolean wantsRegularSpace(String path) {
1354             if ((path.contains("/dateFormatLength") && path.contains("/pattern"))
1355                     || path.contains("/availableFormats/dateFormatItem")
1356                     || (path.startsWith("//ldml/dates/timeZoneNames/metazone")
1357                             && path.contains("/long"))
1358                     || path.startsWith("//ldml/dates/timeZoneNames/regionFormat")
1359                     || path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern")
1360                     || path.startsWith("//ldml/localeDisplayNames/languages/language")
1361                     || path.startsWith("//ldml/localeDisplayNames/territories/territory")
1362                     || path.startsWith("//ldml/localeDisplayNames/types/type")
1363                     || (path.startsWith("//ldml/numbers/currencies/currency")
1364                             && path.contains("/displayName"))
1365                     || (path.contains("/decimalFormatLength[@type=\"long\"]")
1366                             && path.contains("/pattern"))
1367                     || path.startsWith("//ldml/posix/messages")
1368                     || (path.startsWith("//ldml/units/uni") && path.contains("/unitPattern "))) {
1369                 return true;
1370             }
1371             return false;
1372         }
1373 
wantsNBSP(String path)1374         private static boolean wantsNBSP(String path) {
1375             if ((path.contains("/currencies/currency")
1376                             && (path.contains("/group") || path.contains("/pattern")))
1377                     || (path.contains("/currencyFormatLength") && path.contains("/pattern"))
1378                     || (path.contains("/currencySpacing") && path.contains("/insertBetween"))
1379                     || (path.contains("/decimalFormatLength") && path.contains("/pattern"))
1380                     || // i.e. the non-long ones
1381                     (path.contains("/percentFormatLength") && path.contains("/pattern"))
1382                     || (path.startsWith("//ldml/numbers/symbols")
1383                             && (path.contains("/group") || path.contains("/nan")))) {
1384                 return true;
1385             }
1386             return false;
1387         }
1388 
wantsNNBSP(String path)1389         private static boolean wantsNNBSP(String path) {
1390             if ((path.contains("/dayPeriodWidth[@type=\"abbreviated\"]")
1391                             || path.contains("/dayPeriodWidth[@type=\"narrow\"]"))
1392                     && (path.contains("/dayPeriod[@type=\"am\"]")
1393                             || path.contains("/dayPeriod[@type=\"pm\"]"))) {
1394                 return true;
1395             }
1396             return false;
1397         }
1398     }
1399 
1400     private static final Pattern ZERO_WIDTH_SPACES = PatternCache.get("\\u200B+");
1401     private static final Set<String> LOCALES_NOT_ALLOWING_ZWS =
1402             new HashSet<>(Arrays.asList("da", "fr"));
1403 
1404     /**
1405      * Remove occurrences of U+200B ZERO_WIDTH_SPACE under certain conditions
1406      *
1407      * @param value the value to be normalized
1408      * @return the normalized value
1409      *     <p>TODO: extend this method to address more concerns, after clarifying the conditions -
1410      *     enlarge the set LOCALES_NOT_ALLOWING_ZWS? - strip initial and final ZWS in all locales? -
1411      *     reduce two or more adjacent ZWS to one ZWS? - allow or prohibit ZWS by itself as currency
1412      *     symbol, as currently in locales kea, pt_CV, pt_PT - allow or prohibit ZWS preceding URL
1413      *     as in "as per [U+200B]https://www.unicode.org/reports/tr35/tr35-general.html#Annotations
1414      *     " Reference: https://unicode-org.atlassian.net/browse/CLDR-15976
1415      */
normalizeZeroWidthSpace(String value)1416     private String normalizeZeroWidthSpace(String value) {
1417         if (ZERO_WIDTH_SPACES.matcher(value).find()) {
1418             final String localeId = locale.getBaseName();
1419             if (LOCALES_NOT_ALLOWING_ZWS.contains(localeId)) {
1420                 value = ZERO_WIDTH_SPACES.matcher(value).replaceAll("");
1421             }
1422         }
1423         return value;
1424     }
1425 
1426     /**
1427      * If inheritance replacement is enabled and the value matches the Bailey (inherited) value,
1428      * replace the value with CldrUtility.INHERITANCE_MARKER
1429      *
1430      * <p>This is only appropriate if cldrFileForBailey != null, meaning that
1431      * enableInheritanceReplacement has been called -- some cost may be involved in getting
1432      * cldrFileForBailey and calling getBaileyValue, and some callers of DAIP may not want the
1433      * replacement, so the default, when enableInheritanceReplacement has not been called, is no
1434      * replacement
1435      *
1436      * @param path
1437      * @param value
1438      * @return the value or CldrUtility.INHERITANCE_MARKER
1439      */
replaceBaileyWithInheritanceMarker(String path, String value)1440     public String replaceBaileyWithInheritanceMarker(String path, String value) {
1441         if (cldrFileForBailey != null && !value.isEmpty()) {
1442             Output<String> pathWhereFound = new Output<>();
1443             Output<String> localeWhereFound = new Output<>();
1444             String baileyValue =
1445                     cldrFileForBailey.getBaileyValue(path, pathWhereFound, localeWhereFound);
1446             if (value.equals(baileyValue)
1447                     && !XMLSource.ROOT_ID.equals(localeWhereFound.value)
1448                     && !XMLSource.CODE_FALLBACK_ID.equals(localeWhereFound.value)) {
1449                 return CldrUtility.INHERITANCE_MARKER;
1450             }
1451         }
1452         return value;
1453     }
1454 }
1455