DisplayAndInputProcessor.java (revision 912701f9769bb47905792267661f0baf2b85bed5) - OpenGrok cross reference for /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java

/* Copyright (C) 2007-2013 Google and others.  All Rights Reserved. */
/* Copyright (C) 2007-2013 IBM Corp. and others. All Rights Reserved. */

package org.unicode.cldr.test;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.TreeMultimap;
import com.google.myanmartools.ZawgyiDetector;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.DateIntervalInfo;
import com.ibm.icu.text.DateTimePatternGenerator;
import com.ibm.icu.text.DecimalFormat;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Transform;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.unicode.cldr.test.CheckExemplars.ExemplarType;
import org.unicode.cldr.util.AnnotationUtil;
import org.unicode.cldr.util.Builder;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRLocale;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.ComparatorUtilities;
import org.unicode.cldr.util.DateTimeCanonicalizer;
import org.unicode.cldr.util.DateTimeCanonicalizer.DateTimePatternType;
import org.unicode.cldr.util.Emoji;
import org.unicode.cldr.util.LocaleNames;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.SimpleUnicodeSetFormatter;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
import org.unicode.cldr.util.VoteResolver;
import org.unicode.cldr.util.XMLSource;
import org.unicode.cldr.util.XPathParts;

/**
 * Class for processing the input and output of CLDR data for use in the Survey Tool and other
 * tools.
 */
public class DisplayAndInputProcessor {

    /**
     * Special PersonName paths that allow empty string, public for testing. This one is the path
     * prefix for nameOrderLocales.
     */
    public static final String NOL_START_PATH = "//ldml/personNames/nameOrderLocales";

    /** Path prefix for foreignSpaceReplacement; an empty value is allowed on such paths. */
    public static final String FSR_START_PATH = "//ldml/personNames/foreignSpaceReplacement";
    /** Path prefix for nativeSpaceReplacement; an empty value is allowed on such paths. */
    public static final String NSR_START_PATH = "//ldml/personNames/nativeSpaceReplacement";

    /**
     * Placeholder that processForDisplay substitutes for an empty value on the three person-name
     * paths above; processInputMore removes it again from input values.
     */
    public static final String EMPTY_ELEMENT_VALUE = "❮EMPTY❯";

    /** Argument passed to the DateTimeCanonicalizer constructor for {@code dtc}. */
    private static final boolean FIX_YEARS = true;

    /** When true (property "DEBUG_DAIP"), extra diagnostic output is printed. */
    public static final boolean DEBUG_DAIP = CldrUtility.getProperty("DEBUG_DAIP", false);

    /** Frozen set of characters whose Bidi_Class is Arabic_Letter or Right_To_Left. */
    public static final UnicodeSet RTL =
            new UnicodeSet("[[:Bidi_Class=Arabic_Letter:][:Bidi_Class=Right_To_Left:]]").freeze();

    /**
     * Paths of the decimal and grouping separator symbols. For these paths an ASCII apostrophe in
     * the value is replaced by U+2019, both for display and on input.
     */
    public static final Pattern NUMBER_SEPARATOR_PATTERN =
            Pattern.compile("//ldml/numbers/symbols.*/(decimal|group)");

    /**
     * Paths for which apostrophes are NOT normalized (patterns, symbols, characters, delimiters,
     * and the language name with type "mic").
     */
    private static final Pattern APOSTROPHE_SKIP_PATHS =
            PatternCache.get(
                    "//ldml/("
                            + "localeDisplayNames/languages/language\\[@type=\"mic\"].*|"
                            + "characters/.*|"
                            + "delimiters/.*|"
                            + "dates/.+/(pattern|intervalFormatItem|dateFormatItem).*|"
                            + "units/.+/unitPattern.*|"
                            + "units/.+/durationUnitPattern.*|"
                            + "numbers/symbols.*|"
                            + "numbers/miscPatterns.*|"
                            + "numbers/(decimal|currency|percent|scientific)Formats.+/(decimal|currency|percent|scientific)Format.*)");
    /**
     * Interval format paths (items and fallback); values on these paths get interval-specific
     * hyphen and space normalization instead of the general hyphen normalization.
     */
    private static final Pattern INTERVAL_FORMAT_PATHS =
            PatternCache.get("//ldml/dates/.+/intervalFormat(Item.*|Fallback)");
    /**
     * A period that is neither preceded nor followed by '0', '#' or an apostrophe. In
     * processInputMore such periods in non-currency number patterns are replaced by a quoted '.'.
     */
    private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");

    // Pattern to match against paths that might have time formats with h or K (12-hour cycles):
    // standard time formats, plus availableFormats / intervalFormats items whose id (skeleton)
    // contains h or K.
    private static final Pattern HOUR_FORMAT_XPATHS =
            PatternCache.get(
                    "//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
                            + "timeFormats/timeFormatLength\\[@type=\"[^\"]*\"]/timeFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
                            + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*|"
                            + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*)");

    // Group 1: a time field letter (K, h, m or s); group 2: a run of SPACE / NBSP / NNBSP;
    // group 3: one or more 'a' (the am/pm field).
    private static final Pattern AMPM_SPACE_BEFORE =
            PatternCache.get("([Khms])([ \\u00A0\\u202F]+)(a+)"); // time, space, a+
    // Group 1: one or more 'a'; group 2: a run of SPACE / NBSP / NNBSP; group 3: hour letter.
    private static final Pattern AMPM_SPACE_AFTER =
            PatternCache.get("(a+)([ \\u00A0\\u202F]+)([Kh])"); // a+, space, hour

    // Pattern to match against paths that might have date formats with y: standard date formats,
    // plus availableFormats / intervalFormats items whose id (skeleton) contains y.
    private static final Pattern YEAR_FORMAT_XPATHS =
            PatternCache.get(
                    "//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
                            + "dateFormats/dateFormatLength\\[@type=\"[^\"]*\"]/dateFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
                            + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*|"
                            + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*)");

    // Cyrillic year markers are or begin with (in various languages) \u0430 \u0433 \u0435 \u0436
    // \u043E \u0440 \u0441.
    // Group 1 captures the start of the marker, including an optional leading apostrophe.
    private static final Pattern YEAR_SPACE_YEARMARKER =
            PatternCache.get("y[ \\u00A0]+('?[агежорс])"); // y, space, Cyrillic year marker start

    // Intended to match unitPattern paths of narrow-length units.
    // NOTE(review): there is no "/" between the unitLength predicate "]" and "unit" in this regex,
    // so it presumably cannot match a path of the form
    // .../unitLength[@type="narrow"]/unit[@type="..."]/unitPattern... -- confirm against real
    // paths before relying on (or fixing) this pattern.
    public static final Pattern UNIT_NARROW_XPATHS =
            PatternCache.get(
                    "//ldml/units/unitLength\\[@type=\"narrow\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");

    // Intended to match unitPattern paths of short-length units.
    // NOTE(review): same missing "/" as in UNIT_NARROW_XPATHS -- confirm.
    public static final Pattern UNIT_SHORT_XPATHS =
            PatternCache.get(
                    "//ldml/units/unitLength\\[@type=\"short\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");

    // A closing brace of a placeholder followed by a run of SPACE / NBSP / NNBSP
    private static final Pattern PLACEHOLDER_SPACE_AFTER =
            PatternCache.get("\\}[ \\u00A0\\u202F]+");
    // A run of SPACE / NBSP / NNBSP followed by the opening brace of a placeholder
    private static final Pattern PLACEHOLDER_SPACE_BEFORE =
            PatternCache.get("[ \\u00A0\\u202F]+\\{");
    // "} - {" or "} – {": two placeholders separated by a hyphen-minus or en dash (U+2013) with
    // exactly one regular space on each side
    private static final Pattern INTERVAL_FALLBACK_RANGE = PatternCache.get("\\} [\\u2013-] \\{");

    /**
     * A run of regex whitespace ({@code \s}: space, tab, line feed, vertical tab, form feed,
     * carriage return). Does not include NBSP (U+00A0) or NNBSP (U+202F).
     */
    private static final Pattern WHITESPACE_NO_NBSP_TO_NORMALIZE = PatternCache.get("\\s+");

    /**
     * A run of regex whitespace ({@code \s}) and/or NBSP (U+00A0). Since the 2022-12-08 revert
     * noted below, NNBSP (U+202F) is NOT included.
     */
    private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE =
            PatternCache.get("[\\s\\u00A0]+");
    // Reverted 2022-12-08 from:
    // private static final Pattern WHITESPACE_AND_NBSP_TO_NORMALIZE =
    // PatternCache.get("[\\s\\u00A0\\u202F]+");

    /**
     * One or more NBSP (U+00A0) followed by one or more regular spaces. NNBSP is not matched
     * (reverted, see below).
     */
    private static final Pattern NBSP_PLUS_SPACE_TO_NORMALIZE =
            PatternCache.get("\\u00A0+\\u0020+");
    // Reverted 2022-12-08 from:
    // private static final Pattern NBSP_PLUS_SPACE_TO_NORMALIZE =
    // PatternCache.get("[\\u00A0\\u202F]+\\u0020+");

    /**
     * One or more regular spaces followed by one or more NBSP (U+00A0). NNBSP is not matched
     * (reverted, see below).
     */
    private static final Pattern SPACE_PLUS_NBSP_TO_NORMALIZE =
            PatternCache.get("\\u0020+\\u00A0+");
    // Reverted 2022-12-08 from:
    // private static final Pattern SPACE_PLUS_NBSP_TO_NORMALIZE =
    // PatternCache.get("\\u0020+[\\u00A0\\u202F]+");

    // NNBSP 202F among other horizontal spaces (includes 0020, 00A0, 2009, 202F, etc.):
    // either other horizontal space(s), then NNBSP, then any horizontal spaces; or NNBSP followed
    // by at least one horizontal space. A lone NNBSP does not match.
    private static final Pattern NNBSP_AMONG_OTHER_SPACES =
            PatternCache.get("[\\h&&[^\\u202F]]+\\u202F\\h*|\\u202F\\h+");
    // NBSP 00A0 among other horizontal spaces (same shape as above, for U+00A0)
    private static final Pattern NBSP_AMONG_OTHER_SPACES =
            PatternCache.get("[\\h&&[^\\u00A0]]+\\u00A0\\h*|\\u00A0\\h+");
    // THIN SPACE 2009 among other horizontal spaces (same shape as above, for U+2009)
    private static final Pattern THIN_SPACE_AMONG_OTHER_SPACES =
            PatternCache.get("[\\h&&[^\\u2009]]+\\u2009\\h*|\\u2009\\h+");

    // Leading / trailing run of NBSP (U+00A0) and/or NNBSP (U+202F)
    private static final Pattern INITIAL_NBSP = PatternCache.get("^[\\u00A0\\u202F]+");
    private static final Pattern FINAL_NBSP = PatternCache.get("[\\u00A0\\u202F]+$");

    // Two or more consecutive NBSP (U+00A0); NNBSP is not matched (reverted, see below)
    private static final Pattern MULTIPLE_NBSP = PatternCache.get("\\u00A0\\u00A0+");
    // Reverted 2022-12-08 from:
    // private static final Pattern MULTIPLE_NBSP =
    // PatternCache.get("[\\u00A0\\u202F][\\u00A0\\u202F]+");

    // The following includes (among others) \u0009, \u0020, \u00A0, \u2007, \u2009, \u202F, \u3000.
    // processInputMore only runs whitespace normalization when the value contains one of these.
    private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();

    // Locales that get locale-specific input fixes; see processLocaleSpecificInput, which tests
    // the processor's locale against these with childOf().
    private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
    private static final CLDRLocale ROMANIAN = CLDRLocale.getInstance("ro");
    private static final CLDRLocale CATALAN = CLDRLocale.getInstance("ca");
    private static final CLDRLocale NGOMBA = CLDRLocale.getInstance("jgo");
    private static final CLDRLocale KWASIO = CLDRLocale.getInstance("nmg");
    private static final CLDRLocale HEBREW = CLDRLocale.getInstance("he");
    private static final CLDRLocale MYANMAR = CLDRLocale.getInstance("my");
    private static final CLDRLocale KYRGYZ = CLDRLocale.getInstance("ky");
    private static final CLDRLocale URDU = CLDRLocale.getInstance("ur");
    private static final CLDRLocale PASHTO = CLDRLocale.getInstance("ps");
    private static final CLDRLocale FARSI = CLDRLocale.getInstance("fa");
    private static final CLDRLocale GERMAN_SWITZERLAND = CLDRLocale.getInstance("de_CH");
    private static final CLDRLocale SWISS_GERMAN = CLDRLocale.getInstance("gsw");
    private static final CLDRLocale FF_ADLAM = CLDRLocale.getInstance("ff_Adlm");
    private static final CLDRLocale KASHMIRI = CLDRLocale.getInstance("ks");
    /**
     * Locale IDs listed as using a modifier apostrophe. NOTE(review): this is a public, mutable
     * HashSet; callers are presumably expected to treat it as read-only -- confirm.
     */
    public static final Set<String> LANGUAGES_USING_MODIFIER_APOSTROPHE =
            new HashSet<>(
                    Arrays.asList(
                            "br", "bss", "cad", "cic", "cch", "gn", "ha", "ha_Latn", "lkt", "mgo",
                            "mic", "moh", "mus", "nnh", "qu", "quc", "uk", "uz", "uz_Latn"));

    // Each row of the conversion tables below starts with the character to be replaced, followed
    // by its replacement character(s).

    // Ş ş Ţ ţ  =>  Ș ș Ț ț  (cedilla forms => comma-below forms)
    private static final char[][] ROMANIAN_CONVERSIONS = {
        {'\u015E', '\u0218'}, {'\u015F', '\u0219'}, {'\u0162', '\u021A'}, {'\u0163', '\u021B'}
    };

    // L / l with middle dot (single code point) => L / l followed by U+00B7 MIDDLE DOT
    private static final char[][] CATALAN_CONVERSIONS = {
        {'\u013F', '\u004C', '\u00B7'}, // Ŀ -> L·
        {'\u0140', '\u006C', '\u00B7'}
    }; // ŀ -> l·

    private static final char[][] NGOMBA_CONVERSIONS = {
        {'\u0251', '\u0061'}, {'\u0261', '\u0067'}, //  ɑ -> a , ɡ -> g , See ticket #5691
        {'\u2019', '\uA78C'}, {'\u02BC', '\uA78C'}
    }; //  Saltillo, see ticket #6805 (U+2019 and U+02BC both become U+A78C)

    /**
     * Kwasio uses caron instead of breve (see ticket #6571). Each row maps a breve form to the
     * caron form of the same letter and case.
     */
    private static final char[][] KWASIO_CONVERSIONS = {
        {'\u0306', '\u030C'}, // combining breve -> combining caron
        {'\u0103', '\u01CE'}, // a-breve -> a-caron (lowercase)
        {'\u0102', '\u01CD'}, // A-breve -> A-caron (uppercase)
        {'\u0115', '\u011B'}, // e-breve -> e-caron (lowercase)
        // Fixed: the row used to be {'\u011A', '\u01CD'}, which turned E-caron into A-caron and
        // left E-breve (U+0114) unconverted.
        {'\u0114', '\u011A'}, // E-breve -> E-caron (uppercase)
        {'\u012D', '\u01D0'}, // i-breve -> i-caron (lowercase)
        {'\u012C', '\u01CF'}, // I-breve -> I-caron (uppercase)
        {'\u014F', '\u01D2'}, // o-breve -> o-caron (lowercase)
        {'\u014E', '\u01D1'}, // O-breve -> O-caron (uppercase)
        {'\u016D', '\u01D4'}, // u-breve -> u-caron (lowercase)
        {'\u016C', '\u01D3'} // U-breve -> U-caron (uppercase)
    };

    // ASCII apostrophe and quotation mark => Hebrew punctuation; processLocaleSpecificInput
    // applies this only to paths not matched by APOSTROPHE_SKIP_PATHS.
    private static final char[][] HEBREW_CONVERSIONS = {
        {'\'', '\u05F3'}, {'"', '\u05F4'}
    }; //  ' -> geresh  " -> gershayim

    // Lowercase and uppercase pair: U+04CA / U+04C9 => U+04A3 / U+04A2
    private static final char[][] KYRGYZ_CONVERSIONS = {{'ӊ', 'ң'}, {'Ӊ', 'Ң'}}; //  right modifier

    // U+0643 ARABIC LETTER KAF => U+06A9 ARABIC LETTER KEHEH; used for Urdu, Pashto and Farsi
    private static final char[][] URDU_PLUS_CONVERSIONS = {{'\u0643', '\u06A9'}}; //  wrong char

    // U+06CD => U+0620
    private static final char[][] KASHMIRI_CONVERSIONS = {
        {'ۍ', 'ؠ'}
    }; //  wrong char (see CLDR-16595)

    // Shared Zawgyi detector and Zawgyi-to-Unicode transliterator (transform ID "Zawgyi-my");
    // presumably used by standardizeMyanmar, which is outside this part of the file.
    private static final ZawgyiDetector detector = new ZawgyiDetector();
    private static final Transliterator zawgyiUnicodeTransliterator =
            Transliterator.getInstance("Zawgyi-my");

    // Formatters for UnicodeSet values; both are (re)assigned by init(), with locale collators
    // when needsCollator is true and with defaults otherwise.
    private SimpleUnicodeSetFormatter pp = new SimpleUnicodeSetFormatter(); // default collator
    private UnicodeSetPrettyPrinter rawFormatter = new UnicodeSetPrettyPrinter(); // default

    // The locale this processor works for; set once by each constructor.
    private final CLDRLocale locale;
    private String scriptCode; // actual or default script code (not null after init)
    private boolean isPosix; // true if the locale ID contains "POSIX"; set by init()

    // Not assigned anywhere in this part of the file; stays null until set elsewhere.
    private CLDRFile cldrFileForBailey = null;

    /**
     * Constructor, taking cldrFile.
     *
     * @param cldrFileToCheck
     */
    public DisplayAndInputProcessor(CLDRFile cldrFileToCheck, boolean needsCollator) {
        init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), needsCollator);
    }

    public DisplayAndInputProcessor(CLDRFile cldrFileToCheck) {
        init(this.locale = CLDRLocale.getInstance(cldrFileToCheck.getLocaleID()), true);
    }

    /**
     * Shared constructor logic: records whether the locale is a POSIX variant, sets up the
     * UnicodeSet formatters, and resolves the script code.
     *
     * @param locale the locale to initialize for
     * @param needsCollator true to build the formatters from the locale's CLDR collators, false
     *     to use default formatters
     */
    void init(CLDRLocale locale, boolean needsCollator) {
        final String localeId = locale.toString();
        isPosix = localeId.contains("POSIX");

        if (!needsCollator) {
            pp = new SimpleUnicodeSetFormatter(); // default collator
            rawFormatter = new UnicodeSetPrettyPrinter(); // default
        } else {
            final Collator identicalStrength =
                    ComparatorUtilities.getCldrCollator(localeId, Collator.IDENTICAL);
            final Collator primaryStrength =
                    ComparatorUtilities.getCldrCollator(localeId, Collator.PRIMARY);
            pp = new SimpleUnicodeSetFormatter((Comparator) identicalStrength);
            rawFormatter =
                    UnicodeSetPrettyPrinter.from(
                            (Comparator) identicalStrength, (Comparator) primaryStrength);
        }

        // Use the locale's explicit script if it has one; otherwise look up the default script,
        // first for the base name, then for the bare language, and finally settle for "Zzzz".
        String resolved = locale.getScript();
        if (resolved == null || resolved.length() < 4) {
            final SupplementalDataInfo supplemental =
                    CLDRConfig.getInstance().getSupplementalDataInfo();
            resolved = supplemental.getDefaultScript(locale.getBaseName());
            final boolean baseNameLookupFailed =
                    resolved == null || resolved.length() < 4 || "Zzzz".equals(resolved);
            if (baseNameLookupFailed) {
                resolved = supplemental.getDefaultScript(locale.getLanguage());
            }
            if (resolved == null || resolved.length() < 4) {
                resolved = "Zzzz";
            }
        }
        scriptCode = resolved;
    }

    /**
     * Returns the UnicodeSet formatter currently held by this processor.
     *
     * @return the formatter set up by init()
     */
    public SimpleUnicodeSetFormatter getPrettyPrinter() {
        return this.pp;
    }

    /**
     * Constructor, taking ULocale and boolean.
     *
     * <p>Called by getProcessor, with locale = SurveyMain.TRANS_HINT_LOCALE
     *
     * @param locale the ULocale, converted to the corresponding CLDRLocale
     * @param needsCollator true to set up the locale's CLDR collators, false for defaults
     */
    public DisplayAndInputProcessor(ULocale locale, boolean needsCollator) {
        this.locale = CLDRLocale.getInstance(locale);
        init(this.locale, needsCollator);
    }

    /**
     * Constructor, taking ULocale; the locale's collators are always set up.
     *
     * @param locale the ULocale, converted to the corresponding CLDRLocale
     */
    public DisplayAndInputProcessor(ULocale locale) {
        this.locale = CLDRLocale.getInstance(locale);
        init(this.locale, true /* needsCollator */);
    }

    /**
     * Constructor, taking CLDRLocale and boolean.
     *
     * @param locale the CLDRLocale this processor works for
     * @param needsCollator true to set up the locale's CLDR collators, false for defaults
     */
    public DisplayAndInputProcessor(CLDRLocale locale, boolean needsCollator) {
        this.locale = locale;
        init(this.locale, needsCollator);
    }

    /**
     * Constructor, taking CLDRLocale; the locale's collators are always set up.
     *
     * @param locale the CLDRLocale this processor works for
     */
    public DisplayAndInputProcessor(CLDRLocale locale) {
        this.locale = locale;
        init(this.locale, true /* needsCollator */);
    }

    /**
     * Converts a stored value into the form shown in the Survey Tool or a similar program.
     *
     * <p>A null value and the inheritance marker are returned as-is. Everything else is
     * NFC-normalized first. UnicodeSet values and stopword values are then handled separately and
     * returned at once; all other values go through number-pattern canonicalization (numeric
     * paths only), apostrophe and hyphen normalization, and finally the empty-value placeholder
     * substitution for the person-name paths that allow an empty value.
     *
     * @param path the path of the value
     * @param value the stored value; may be null
     * @return the value for display, or null if value was null
     */
    public synchronized String processForDisplay(String path, String value) {
        if (value == null || CldrUtility.INHERITANCE_MARKER.equals(value)) {
            return value;
        }
        // Always normalize all text to NFC.
        String shown = Normalizer.compose(value, false);

        if (hasUnicodeSetValue(path)) {
            return displayUnicodeSet(shown);
        }
        if (path.contains("stopword")) {
            return shown.trim().isEmpty() ? "NONE" : shown;
        }

        final NumericType numericType = NumericType.getNumericType(path);
        if (numericType != NumericType.NOT_NUMERIC) {
            // Canonicalize existing values that aren't canonicalized yet.
            // New values will be canonicalized on input using processInput().
            try {
                shown = getCanonicalPattern(shown, numericType, isPosix);
            } catch (IllegalArgumentException e) {
                // Best effort: keep the uncanonicalized pattern.
                if (DEBUG_DAIP) {
                    System.err.println("Illegal pattern: " + shown);
                }
            }
            final boolean isCurrencyPattern =
                    numericType == NumericType.CURRENCY
                            || numericType == NumericType.CURRENCY_ABBREVIATED;
            if (!isCurrencyPattern) {
                shown = shown.replace("'", "");
            }
        }

        // Fix up any apostrophes in number symbols
        if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
            shown = shown.replace('\'', '\u2019');
        }
        // Fix up any apostrophes as appropriate (but not for things like date patterns)
        if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
            shown = normalizeApostrophes(shown);
        }
        // Fix up hyphens, replacing with N-dash as appropriate; the interval variant may also
        // adjust spaces around the en dash
        shown =
                INTERVAL_FORMAT_PATHS.matcher(path).matches()
                        ? normalizeIntervalHyphensAndSpaces(shown)
                        : normalizeHyphens(shown);

        // Fix up possibly empty field
        if (shown.isEmpty()
                && (path.startsWith(FSR_START_PATH)
                        || path.startsWith(NSR_START_PATH)
                        || path.startsWith(NOL_START_PATH))) {
            shown = EMPTY_ELEMENT_VALUE;
        }
        return shown;
    }

    /**
     * Tells whether values on the given path are UnicodeSets.
     *
     * @param path the path to test
     * @return true for exemplarCharacters and parseLenients paths
     */
    private boolean hasUnicodeSetValue(String path) {
        if (path.startsWith("//ldml/characters/exemplarCharacters")) {
            return true;
        }
        return path.startsWith("//ldml/characters/parseLenients");
    }

    // Canonicalizer used by processInputMore for date/time patterns
    static final DateTimeCanonicalizer dtc = new DateTimeCanonicalizer(FIX_YEARS);

    // The three separators accepted between annotation keywords
    private static final String BAR_VL = "\\|"; // U+007C VERTICAL LINE (pipe, bar) literal
    private static final String BAR_EL = "\\s+l\\s+"; // U+006C LATIN SMALL LETTER L with space
    private static final String BAR_DANDA = "।"; // U+0964 DEVANAGARI DANDA
    private static final String BAR_REGEX = "(" + BAR_VL + "|" + BAR_EL + "|" + BAR_DANDA + ")";
    /** Splits on any of the bar-like separators above, trimming pieces and dropping empty ones. */
    public static final Splitter SPLIT_BAR =
            Splitter.on(Pattern.compile(BAR_REGEX)).trimResults().omitEmptyStrings();
    // Splits on single regular spaces, trimming pieces and dropping empty ones
    static final Splitter SPLIT_SPACE = Splitter.on(' ').trimResults().omitEmptyStrings();
    // Joins with " | " (the canonical keyword separator) and with a single space, respectively
    static final Joiner JOIN_BAR = Joiner.on(" | ");
    static final Joiner JOIN_SPACE = Joiner.on(' ');

    /**
     * Process the value for input. The result is a cleaned-up value. For example, an exemplar set
     * is modified to be in the normal format, and any missing [ ] are added (a common omission on
     * entry). If there are any failures then the original value is returned, so that the proper
     * error message can be given.
     *
     * <p>A null value and the inheritance marker are returned unchanged, without any processing
     * and without touching internalException. (A null value used to cause a
     * NullPointerException during normalization.)
     *
     * @param path the path of the value
     * @param value the value as entered; may be null
     * @param internalException if not null, element 0 is cleared and then filled in if a
     *     RuntimeException occurs during the main processing
     * @return the possibly modified value; the original value if processing failed
     */
    public synchronized String processInput(
            String path, String value, Exception[] internalException) {
        // skip processing for a missing value and for the inheritance marker
        if (value == null || CldrUtility.INHERITANCE_MARKER.equals(value)) {
            return value;
        }
        final String original = value;
        value = stripProblematicControlCharacters(value);
        value = Normalizer.compose(value, false); // Always normalize all input to NFC.
        value = value.replace('\u00B5', '\u03BC'); // use the right Greek mu character
        if (internalException != null) {
            internalException[0] = null;
        }
        // for root annotations: no further processing
        if (CLDRLocale.ROOT.equals(locale) && path.contains("/annotations")) {
            return value;
        }
        try {
            value = processInputMore(path, value);
        } catch (RuntimeException e) {
            // Report the failure to the caller and fall back to the value as entered.
            if (internalException != null) {
                internalException[0] = e;
            }
            return original;
        }
        return value;
    }

    /**
     * Main body of input processing, called by processInput after control-character stripping
     * and NFC normalization. UnicodeSet values are handed to inputUnicodeSet and returned at once;
     * every other value runs through the normalization steps below, in order.
     *
     * @param path the path of the value
     * @param value the partially processed input value
     * @return the cleaned-up value
     */
    private String processInputMore(String path, String value) {
        final boolean isUnicodeSet = hasUnicodeSetValue(path);
        if (isUnicodeSet) {
            return inputUnicodeSet(path, value);
        }
        // From here on isUnicodeSet is always false, so the "!isUnicodeSet" checks below are
        // always true.

        value = processLocaleSpecificInput(path, value, isUnicodeSet);

        // Only run whitespace normalization if there is any whitespace at all
        if (UNICODE_WHITESPACE.containsSome(value)) {
            value = normalizeWhitespace(path, value);
        }

        // remove the empty value (mostly relevant for person names,
        // but prevents it showing up elsewhere by mistake
        value = value.replace(EMPTY_ELEMENT_VALUE, "");

        // all of our values should not have leading or trailing spaces, except insertBetween,
        // foreignSpaceReplacement, and anything with built-in attribute xml:space="preserve"
        if (!path.contains("/insertBetween")
                && !path.contains("/foreignSpaceReplacement")
                && !path.contains("/nativeSpaceReplacement")
                && !path.contains("[@xml:space=\"preserve\"]")
                && !isUnicodeSet) {
            value = value.trim();
        }

        // fix grouping separator if space: number symbols use NBSP instead of a regular space,
        // and an empty symbol becomes a single NBSP
        if (path.startsWith("//ldml/numbers/symbols") && !path.contains("/alias")) {
            if (value.isEmpty()) {
                value = "\u00A0";
            }
            value = value.replace(' ', '\u00A0');
        }

        // fix date patterns
        DateTimePatternType datetimePatternType = DateTimePatternType.fromPath(path);
        if (DateTimePatternType.STOCK_AVAILABLE_INTERVAL_PATTERNS.contains(datetimePatternType)) {
            try {
                value = dtc.getCanonicalDatePattern(path, value, datetimePatternType);
            } catch (IllegalArgumentException ex) {
                // The pattern could not be canonicalized: return the value as processed so far,
                // skipping all remaining steps.
                return value;
            }
        }

        if (path.startsWith("//ldml/numbers/currencies/currency") && path.contains("displayName")) {
            value = normalizeCurrencyDisplayName(value);
        }
        NumericType numericType = NumericType.getNumericType(path);
        if (numericType != NumericType.NOT_NUMERIC) {
            if (numericType == NumericType.CURRENCY) {
                // Currency patterns: every regular space becomes NBSP
                value = value.replaceAll(" ", "\u00A0");
                // NOTE: the following "if ... NumericType.CURRENCY_ABBREVIATED" was false here,
                // since we know it is NumericType.CURRENCY; so now the code is commented out; if
                // anyone
                // understands what the intention was, maybe the condition should be restored
                // somehow,
                // such as with "else if"
                // if (numericType == NumericType.CURRENCY_ABBREVIATED) {
                //    value = value.replaceAll("0\\.0+", "0");
                // }
            } else {
                // Other patterns: a space next to a percent or currency sign becomes NBSP
                value =
                        value.replaceAll("([%\u00A4]) ", "$1\u00A0")
                                .replaceAll(" ([%\u00A4])", "\u00A0$1");
                // Quote periods that are not next to a digit placeholder or an apostrophe
                value = replace(NON_DECIMAL_PERIOD, value, "'.'");
                if (numericType == NumericType.DECIMAL_ABBREVIATED) {
                    // Abbreviated decimal patterns: replace "0.0", "0.00", ... with "0"
                    value = value.replaceAll("0\\.0+", "0");
                }
            }
            value = getCanonicalPattern(value, numericType, isPosix);
        }

        // fix [,]: square brackets (ASCII and fullwidth) in display names become parentheses
        if (path.startsWith("//ldml/localeDisplayNames/languages/language")
                || path.startsWith("//ldml/localeDisplayNames/scripts/script")
                || path.startsWith("//ldml/localeDisplayNames/territories/territory")
                || path.startsWith("//ldml/localeDisplayNames/variants/variant")
                || path.startsWith("//ldml/localeDisplayNames/keys/key")
                || path.startsWith("//ldml/localeDisplayNames/types/type")) {
            value = value.replace('[', '(').replace(']', ')').replace('［', '（').replace('］', '）');
        }

        // Normalize two single quotes for the inches symbol.
        if (path.contains("/units")) {
            value = value.replace("''", "″");
        }

        // check specific cases
        //        if (isUnicodeSet) {
        //            value = inputUnicodeSet(path, value);
        //        } else
        // "NONE" is what processForDisplay shows for an empty stopword value; map it back.
        if (path.contains("stopword")) {
            if (value.equals("NONE")) {
                value = "";
            }
        }

        // Normalize ellipsis data.
        if (path.startsWith("//ldml/characters/ellipsis")) {
            value = value.replace("...", "…");
        }

        if (path.startsWith(NOL_START_PATH)) {
            value = normalizeNameOrderLocales(value);
        }

        // Replace Arabic presentation forms with their nominal counterparts
        value = replaceArabicPresentationForms(value);

        // Fix up any apostrophes as appropriate (Don't do so for things like date patterns...
        if (!APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
            value = normalizeApostrophes(value);
        }
        // Fix up any apostrophes in number symbols
        if (NUMBER_SEPARATOR_PATTERN.matcher(path).matches()) {
            value = value.replace('\'', '\u2019');
        }
        // Fix up hyphens, replacing with N-dash as appropriate
        if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
            value =
                    normalizeIntervalHyphensAndSpaces(
                            value); // This may also adjust spaces around en dash
        } else if (!isUnicodeSet) {
            value = normalizeHyphens(value);
        }
        value = processAnnotations(path, value);
        value = normalizeZeroWidthSpace(value);
        if (VoteResolver.DROP_HARD_INHERITANCE) {
            value = replaceBaileyWithInheritanceMarker(path, value);
        }
        return value;
    }

    /**
     * Applies the input fix that belongs to this processor's locale, if any. The locale checks
     * are made in a fixed order and at most one fix is applied: the first one whose full
     * condition holds.
     *
     * @param path the path of the value
     * @param value the input value
     * @param isUnicodeSet true if the value is a UnicodeSet; several fixes are skipped then
     * @return the fixed value, or the unchanged value if no fix applies
     */
    private String processLocaleSpecificInput(String path, String value, boolean isUnicodeSet) {
        if (locale.childOf(MALAYALAM)) {
            final String normalized = normalizeMalayalam(value);
            if (DEBUG_DAIP) {
                System.out.println(
                        "DAIP: Normalized Malayalam '" + value + "' to '" + normalized + "'");
            }
            return normalized;
        }
        if (locale.childOf(ROMANIAN) && !isUnicodeSet) {
            return standardizeRomanian(value);
        }
        if (locale.childOf(CATALAN) && !isUnicodeSet) {
            return standardizeCatalan(value);
        }
        if (locale.childOf(NGOMBA) && !isUnicodeSet) {
            return standardizeNgomba(value);
        }
        if (locale.childOf(KWASIO) && !isUnicodeSet) {
            return standardizeKwasio(value);
        }
        if (locale.childOf(HEBREW) && !APOSTROPHE_SKIP_PATHS.matcher(path).matches()) {
            return replaceChars(path, value, HEBREW_CONVERSIONS, false);
        }
        if ((locale.childOf(SWISS_GERMAN) || locale.childOf(GERMAN_SWITZERLAND))
                && !isUnicodeSet) {
            return standardizeSwissGerman(value);
        }
        if (locale.childOf(MYANMAR) && !isUnicodeSet) {
            return standardizeMyanmar(value);
        }
        if (locale.childOf(KYRGYZ)) {
            return replaceChars(path, value, KYRGYZ_CONVERSIONS, false);
        }
        if (locale.childOf(URDU) || locale.childOf(PASHTO) || locale.childOf(FARSI)) {
            return replaceChars(path, value, URDU_PLUS_CONVERSIONS, true);
        }
        if (locale.childOf(FF_ADLAM) && !isUnicodeSet) {
            return fixAdlamNasalization(value);
        }
        if (locale.childOf(KASHMIRI)) {
            return replaceChars(path, value, KASHMIRI_CONVERSIONS, false);
        }
        return value;
    }

    /**
     * Normalizes an annotation value; a value on a non-annotation path is returned unchanged.
     *
     * <p>Name (tts) paths: the row has an emoji followed by "-name" in the first column (cf.
     * namePath, getNamePaths). Normally the value is like "zebra" or "unicorn face", without "|".
     * If the user enters a value with "|", anything after the first "|" is discarded; e.g., "a |
     * b | c" becomes "a". If the value has no non-empty segment at all (for example "" or " |
     * "), the result is the empty string; previously this case threw NoSuchElementException.
     *
     * <p>Keyword paths: the row has an emoji followed by "–keywords" in the first column (cf.
     * keywordPath, getKeywordPaths). Normally the value is like "stripe | zebra", with "|"; it is
     * passed through annotationsForDisplay.
     *
     * @param path the path of the value
     * @param value the input value
     * @return the possibly modified value
     */
    private String processAnnotations(String path, String value) {
        if (!AnnotationUtil.pathIsAnnotation(path)) {
            return value;
        }
        if (!path.contains(Emoji.TYPE_TTS)) {
            return annotationsForDisplay(value);
        }
        // SPLIT_BAR trims the segments and omits empty ones, so the list may be empty.
        final List<String> segments = SPLIT_BAR.splitToList(value);
        return segments.isEmpty() ? "" : segments.get(0);
    }

    /**
     * Normalizes a space-separated list of locale codes for nameOrderLocales: the empty-value
     * placeholder is removed, duplicates are dropped, LocaleNames.ZXX is removed, the codes are
     * sorted, and LocaleNames.UND (if present) is moved to the front.
     *
     * @param value the input value
     * @return the normalized list, joined with single spaces; may be empty
     */
    private String normalizeNameOrderLocales(String value) {
        final String withoutPlaceholder = value.replace(EMPTY_ELEMENT_VALUE, "");
        final TreeSet<String> codes = new TreeSet<>(SPLIT_SPACE.splitToList(withoutPlaceholder));
        codes.remove(LocaleNames.ZXX);
        final boolean hadUnd = codes.remove(LocaleNames.UND);
        final String others = JOIN_SPACE.join(codes);
        if (!hadUnd) {
            return others;
        }
        // put und at the front
        return codes.isEmpty() ? LocaleNames.UND : LocaleNames.UND + " " + others;
    }

    /**
     * Removes every code point below U+0020 other than U+0009 (tab), U+000A (line feed) and
     * U+000D (carriage return). All other code points are kept, in order.
     *
     * @param s the string; may be null or empty, in which case it is returned as-is
     * @return the string without the removed code points
     */
    private String stripProblematicControlCharacters(String s) {
        if (s == null || s.isEmpty()) {
            return s;
        }
        final StringBuilder kept = new StringBuilder();
        int index = 0;
        while (index < s.length()) {
            final int codePoint = s.codePointAt(index);
            index += Character.charCount(codePoint);
            final boolean isAllowed =
                    codePoint >= 0x20 || codePoint == 9 || codePoint == 0xA || codePoint == 0xD;
            if (isAllowed) {
                kept.appendCodePoint(codePoint);
            }
        }
        return kept.toString();
    }

    // When true, annotationsForDisplay also calls filterCoveredKeywords on the sorted keywords.
    private static final boolean REMOVE_COVERED_KEYWORDS = true;

    /**
     * Rebuilds an annotation keyword list: the value is split with SPLIT_BAR, the keywords are
     * sorted (and de-duplicated) with the root collator, covered keywords are filtered out, and
     * the rest is joined with " | ".
     *
     * <p>Examples: "b | a" becomes "a | b"; "bear | panda | panda bear" becomes "bear | panda".
     *
     * @param value the keyword list as entered
     * @return the possibly modified keyword list
     */
    private static String annotationsForDisplay(String value) {
        final TreeSet<String> keywords = new TreeSet<>(Collator.getInstance(ULocale.ROOT));
        for (String keyword : SPLIT_BAR.split(value)) {
            keywords.add(keyword);
        }
        if (REMOVE_COVERED_KEYWORDS) {
            filterCoveredKeywords(keywords);
        }
        return JOIN_BAR.join(keywords);
    }

    /**
     * Filter from the given set some keywords that include spaces, if they duplicate, or are
     * "covered by", other keywords in the set.
     *
     * <p>For example, if the set is {"bear", "panda", "panda bear"} (annotation was "bear | panda |
     * panda bear"), then remove "panda bear", treating it as "covered" since the set already
     * includes "panda" and "bear". Also, for example, if the set is {"bear", "panda", "PANDA
     * BEAR"}, then remove "PANDA BEAR" even though the casing differs.
     *
     * <p>Since casing is complex in many languages/scripts, this method does not attempt to
     * recognize all occurrences of case-insensitive matching. Instead, it first checks for
     * case-sensitive (exact) matching, then it checks for case-insensitive (loose) matching
     * according to Locale.ROOT. The intended effect is only to remove an item like "PANDA BEAR" if
     * both "panda" and "bear" are already present as individual items. The intended effect is never
     * to modify the casing of any item that is already present.
     *
     * @param sorted the set from which items may be removed
     */
    public static void filterCoveredKeywords(TreeSet<String> sorted) {
        // for now, just do single items

        // Lower-cased copy of every keyword; consulted only for the loose (case-insensitive) check.
        final TreeSet<String> lowerCasedKeywords = new TreeSet<>();
        for (String keyword : sorted) {
            lowerCasedKeywords.add(keyword.toLowerCase(Locale.ROOT));
        }

        final HashSet<String> covered = new HashSet<>();
        for (String keyword : sorted) {
            final List<String> words = SPLIT_SPACE.splitToList(keyword);
            if (words.size() < 2) {
                continue; // a keyword that does not split into several words is never covered
            }
            // Exact match first: every word is itself a keyword in the set.
            boolean isCovered = sorted.containsAll(words);
            if (!isCovered) {
                // Loose match: compare lower-cased words against lower-cased keywords.
                final List<String> lowerCasedWords = new ArrayList<>();
                for (String word : words) {
                    lowerCasedWords.add(word.toLowerCase(Locale.ROOT));
                }
                isCovered = lowerCasedKeywords.containsAll(lowerCasedWords);
            }
            if (isCovered) {
                covered.add(keyword);
            }
        }
        // Removal is deferred so the set is not modified while it is being iterated.
        sorted.removeAll(covered);
    }

    /**
     * Given a sorted list like "BEAR | Bear | PANDA | Panda | panda", filter out any items that
     * duplicate other items aside from case, leaving only, for example, "BEAR | PANDA".
     *
     * @param sorted the set from which items may be removed
     */
    public static void filterKeywordsDifferingOnlyInCase(TreeSet<String> sorted) {
        // Group items by their lower-cased form. Locale.ROOT keeps the grouping independent of
        // the JVM default locale (under a Turkish default, "I" would lower-case to a dotless i
        // and no longer group with "i"), matching what filterCoveredKeywords does.
        final Map<String, String> keptVariantByLower = new HashMap<>();
        for (String item : sorted) {
            // Within each group keep the variant that comes first in natural String order.
            keptVariantByLower.merge(
                    item.toLowerCase(Locale.ROOT),
                    item,
                    (kept, candidate) -> kept.compareTo(candidate) <= 0 ? kept : candidate);
        }
        // Drop every item that is not the kept variant of its group.
        sorted.retainAll(new HashSet<>(keptVariantByLower.values()));
    }

    /**
     * Parse the given UnicodeSet pattern and format it with the formatter {@code pp}.
     *
     * @param value the UnicodeSet pattern string
     * @return the result of {@code pp.format} on the parsed set
     */
    private String displayUnicodeSet(String value) {
        // The constructor will throw an exception if bad format, eg missing [...]
        final UnicodeSet parsed = new UnicodeSet(value);
        return pp.format(parsed);
    }

    /**
     * Parse an incoming UnicodeSet value and return its cleaned, formatted form.
     *
     * <p>A trimmed value wrapped in square brackets is first tried as a UnicodeSet pattern; if
     * that is not applicable or fails, {@code pp.parse} is tried. A value neither can parse is
     * returned as is (trimmed).
     *
     * @param path the xpath of the value; decides which exemplar type, if any, applies
     * @param value the incoming value
     * @return the cleaned value, or the trimmed input if it cannot be parsed
     */
    private String inputUnicodeSet(String path, String value) {
        // hack, in case the input is called twice
        value = value.trim();

        UnicodeSet parsed = null;
        final boolean looksBracketed = value.startsWith("[") && value.endsWith("]");
        if (looksBracketed) {
            try {
                parsed = new UnicodeSet(value);
            } catch (Exception ignored) {
                // fall through and let pp.parse have a go
            }
        }
        if (parsed == null) {
            try {
                parsed = pp.parse(value);
            } catch (Exception ignored) {
                // can't parse at all
                return value; // we can't throw an exception because clients won't expect it.
            }
        }

        final XPathParts parts = XPathParts.getFrozenInstance(path);
        final String type = parts.getAttributeValue(-1, "type");
        final ExemplarType exemplarType;
        if (!path.contains("exemplarCharacters")) {
            exemplarType = null;
        } else if (type == null) {
            // an exemplarCharacters path without a type attribute is the main exemplar set
            exemplarType = ExemplarType.main;
        } else {
            exemplarType = ExemplarType.valueOf(type);
        }
        return getCleanedUnicodeSet(parsed, exemplarType);
    }

    /**
     * Inside parentheses, turn a hyphen-minus that directly follows a digit into an en dash
     * (U+2013), as used for date ranges.
     *
     * @param value the currency display name
     * @return the value with the qualifying hyphens replaced
     */
    private String normalizeCurrencyDisplayName(String value) {
        final int length = value.length();
        final StringBuilder normalized = new StringBuilder();
        boolean insideParentheses = false;
        for (int index = 0; index < length; index++) {
            char current = value.charAt(index);
            switch (current) {
                case '(':
                    insideParentheses = true;
                    break;
                case ')':
                    insideParentheses = false;
                    break;
                case '-':
                    // insideParentheses can only be true after a '(' was seen, so index - 1 is
                    // a valid position here.
                    if (insideParentheses && Character.isDigit(value.charAt(index - 1))) {
                        current = '\u2013'; /* Replace hyphen-minus with dash for date ranges */
                    }
                    break;
                default:
                    break;
            }
            normalized.append(current);
        }
        return normalized.toString();
    }

    private String normalizeApostrophes(String value) {
        // If our DAIP always had a CLDRFile to work with, then we could just check the exemplar set
        // in it to see.
        // But since we don't, we just maintain the list internally and use it.
        if (LANGUAGES_USING_MODIFIER_APOSTROPHE.contains(locale.getLanguage())) {
            // These languages get U+02BC for every ASCII apostrophe.
            return value.replace('\'', '\u02bc');
        }

        // Otherwise pick a curly quote from the preceding character: U+2019 after a letter,
        // U+2018 anywhere else (including at the start of the value).
        final StringBuilder result = new StringBuilder();
        char previous = 0;
        for (int i = 0; i < value.length(); i++) {
            final char current = value.charAt(i);
            if (current != '\'') {
                result.append(current);
            } else {
                result.append(Character.isLetter(previous) ? '\u2019' : '\u2018');
            }
            previous = current;
        }
        return result.toString();
    }

    /**
     * Normalize the range separator of a date interval pattern: hyphen-minus becomes an en dash
     * (U+2013), and for Latin script a space-delimited hyphen or en dash at the end of the
     * separator becomes an en dash between thin spaces (U+2009).
     *
     * @param value the interval pattern, or an intervalFormatFallback pattern containing "{0}"
     * @return the normalized pattern, or the value unchanged when nothing applies
     */
    private String normalizeIntervalHyphensAndSpaces(String value) {
        if (value.contains("{0}")) {
            // intervalFormatFallback pattern, not handled by DateTimePatternGenerator.FormatParser
            if (!scriptCode.equals("Latn")) {
                return value;
            }
            return INTERVAL_FALLBACK_RANGE.matcher(value).replaceAll("}\u2009\u2013\u2009{");
        }

        final DateTimePatternGenerator.FormatParser parser =
                new DateTimePatternGenerator.FormatParser();
        // first format & separator including spaces
        final String firstPart = DateIntervalInfo.genPatternInfo(value, false).getFirstPart();
        parser.set(firstPart);
        final List<Object> items = parser.getItems();
        final Object last = items.get(items.size() - 1);
        if (!(last instanceof String)) {
            return value; // the first part does not end in literal text, so no separator
        }

        // separator including spaces, and possibly preceding literal text (. or quoted)
        final String separator = last.toString();
        String replacement = separator;
        if (scriptCode.equals("Latn")
                && (separator.endsWith(" - ") || separator.endsWith(" \u2013 "))) {
            // Per CLDR-14032,16308
            replacement =
                    separator.substring(0, separator.length() - 3) + "\u2009\u2013\u2009";
        } else if (separator.contains("-")) {
            replacement = separator.replace("-", "\u2013");
        }
        if (replacement.equals(separator)) {
            return value;
        }

        final int separatorStart = firstPart.lastIndexOf(separator);
        if (separatorStart < 0) {
            return value;
        }
        // first part up to the separator + new separator + second format only
        return firstPart.substring(0, separatorStart)
                + replacement
                + DateIntervalInfo.genPatternInfo(value, false).getSecondPart();
    }

    /**
     * Replace the first hyphen-minus in the value with an en dash (U+2013) when it sits directly
     * between two digits. Only the first hyphen-minus is examined.
     *
     * @param value the string to normalize
     * @return the value with that hyphen replaced, or the value unchanged
     */
    private String normalizeHyphens(String value) {
        final int position = value.indexOf("-");
        // The hyphen needs a character on each side, which also rules out "not found" (-1).
        final boolean hasNeighbours = position > 0 && position < value.length() - 1;
        if (!hasNeighbours) {
            return value;
        }
        if (!Character.isDigit(value.charAt(position - 1))
                || !Character.isDigit(value.charAt(position + 1))) {
            return value;
        }
        final StringBuilder dashed = new StringBuilder(value);
        dashed.setCharAt(position, '\u2013');
        return dashed.toString();
    }

    /**
     * Replace each character that appears as the first element of a ROMANIAN_CONVERSIONS pair
     * with the second element of that pair.
     *
     * @param value the string to convert
     * @return the converted string
     */
    private String standardizeRomanian(String value) {
        final char[] chars = value.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            for (char[] pair : ROMANIAN_CONVERSIONS) {
                if (pair[0] == chars[i]) {
                    chars[i] = pair[1];
                    break; // at most one conversion per character
                }
            }
        }
        return new String(chars);
    }

    /**
     * Replace each character that appears as the first element of a KWASIO_CONVERSIONS pair with
     * the second element of that pair.
     *
     * @param value the string to convert
     * @return the converted string
     */
    private String standardizeKwasio(String value) {
        final StringBuilder converted = new StringBuilder();
        for (int i = 0; i < value.length(); i++) {
            char mapped = value.charAt(i);
            for (char[] pair : KWASIO_CONVERSIONS) {
                if (pair[0] == mapped) {
                    mapped = pair[1];
                    break; // at most one conversion per character
                }
            }
            converted.append(mapped);
        }
        return converted.toString();
    }

    // Use the myanmar-tools detector.
    private String standardizeMyanmar(String value) {
        // Convert only when the detector reports a Zawgyi probability above 0.90.
        final boolean likelyZawgyi = detector.getZawgyiProbability(value) > 0.90;
        return likelyZawgyi ? zawgyiUnicodeTransliterator.transform(value) : value;
    }

    /**
     * Apply NGOMBA_CONVERSIONS character by character. A character that a conversion turns into
     * U+A78C is upgraded to the upper case saltillo U+A78B when its neighbours in the original
     * text are upper case.
     *
     * @param value the string to convert
     * @return the converted string
     */
    private String standardizeNgomba(String value) {
        final char[] source = value.toCharArray();
        final int length = source.length;
        final StringBuilder result = new StringBuilder();
        for (int i = 0; i < length; i++) {
            char out = source[i];
            boolean becameSaltillo = false;
            for (char[] pair : NGOMBA_CONVERSIONS) {
                if (pair[0] != out) {
                    continue;
                }
                out = pair[1];
                becameSaltillo = (out == '\uA78C');
                break;
            }
            // Both upper-case rules need an upper case letter right before the saltillo; in
            // addition either the next character or the one two positions back must be upper
            // case. The neighbours are read from the unconverted input.
            if (becameSaltillo && i > 0 && Character.isUpperCase(source[i - 1])) {
                final boolean upperAfter = i < length - 1 && Character.isUpperCase(source[i + 1]);
                final boolean upperTwoBefore = i > 1 && Character.isUpperCase(source[i - 2]);
                if (upperAfter || upperTwoBefore) {
                    out = '\uA78B'; // UPPER CASE SALTILLO
                }
            }
            result.append(out);
        }
        return result.toString();
    }

    /**
     * Replace characters according to a table of (from, to) pairs.
     *
     * @param path the xpath of the value
     * @param value the string to convert
     * @param charsToReplace pairs whose first element is replaced by the second
     * @param skipAuxExemplars if true, values on auxiliary exemplarCharacters paths are returned
     *     unchanged
     * @return the converted string
     */
    private String replaceChars(
            String path, String value, char[][] charsToReplace, boolean skipAuxExemplars) {
        if (skipAuxExemplars && path.contains("/exemplarCharacters[@type=\"auxiliary\"]")) {
            return value;
        }
        final StringBuilder result = new StringBuilder(value);
        for (int i = 0; i < result.length(); i++) {
            final char original = result.charAt(i);
            for (char[] pair : charsToReplace) {
                if (pair[0] == original) {
                    result.setCharAt(i, pair[1]);
                    break; // at most one conversion per character
                }
            }
        }
        return result.toString();
    }

    /**
     * Replace every sharp s (U+00DF) with "ss".
     *
     * @param value the string to convert
     * @return the converted string
     */
    private String standardizeSwissGerman(String value) {
        // Literal replacement; the search text contains no regex metacharacters.
        return value.replace("\u00DF", "ss");
    }

    /**
     * Expand each character that appears as the first element of a CATALAN_CONVERSIONS triple
     * into the two characters given by the second and third elements.
     *
     * @param value the string to convert
     * @return the converted string
     */
    private String standardizeCatalan(String value) {
        final StringBuilder result = new StringBuilder();
        nextChar:
        for (char c : value.toCharArray()) {
            for (char[] triple : CATALAN_CONVERSIONS) {
                if (triple[0] == c) {
                    result.append(triple[1]).append(triple[2]);
                    continue nextChar; // substituted; do not also append the original
                }
            }
            result.append(c);
        }
        return result.toString();
    }

    /**
     * Replace all matches of the pattern in the value, logging the change to standard output when
     * DEBUG_DAIP is set.
     *
     * @param pattern the pattern to look for
     * @param value the string to search
     * @param replacement the regex replacement string
     * @return the value after replacement
     */
    private String replace(Pattern pattern, String value, String replacement) {
        final String replaced = pattern.matcher(value).replaceAll(replacement);
        if (DEBUG_DAIP) {
            if (!value.equals(replaced)) {
                System.out.println("\n" + value + " => " + replaced);
            }
        }
        return replaced;
    }

    // Matches one of six characters from the Malayalam block (captured as group 1) followed by
    // U+0D4D and U+200D.
    private static final Pattern UNNORMALIZED_MALAYALAM =
            PatternCache.get("(\u0D23|\u0D28|\u0D30|\u0D32|\u0D33|\u0D15)\u0D4D\u200D");

    // For each character that UNNORMALIZED_MALAYALAM can capture, the single character that
    // normalizeMalayalam writes in place of the whole matched sequence.
    private static final Map<Character, Character> NORMALIZING_MAP =
            Builder.with(new HashMap<Character, Character>())
                    .put('\u0D23', '\u0D7A')
                    .put('\u0D28', '\u0D7B')
                    .put('\u0D30', '\u0D7C')
                    .put('\u0D32', '\u0D7D')
                    .put('\u0D33', '\u0D7E')
                    .put('\u0D15', '\u0D7F')
                    .get();

    /**
     * Normalizes the Malayalam characters in the specified input.
     *
     * @param value the input to be normalized
     * @return the value with each unnormalized sequence replaced by its normalized character
     */
    private String normalizeMalayalam(String value) {
        final Matcher unnormalized = UNNORMALIZED_MALAYALAM.matcher(value);
        if (!unnormalized.find()) {
            return value; // nothing to normalize
        }
        final StringBuilder normalized = new StringBuilder();
        int copiedUpTo = 0;
        do {
            // Copy the text before this match, then write the mapped character in place of
            // the whole matched sequence.
            normalized.append(value, copiedUpTo, unnormalized.start());
            final Character replacement = NORMALIZING_MAP.get(unnormalized.group(1).charAt(0));
            normalized.append(replacement);
            copiedUpTo = unnormalized.end();
        } while (unnormalized.find());
        normalized.append(value, copiedUpTo, value.length());
        return normalized.toString();
    }

    // Transliterator built from an ID that pairs a UnicodeSet of the Arabic Presentation Forms-A
    // and Forms-B blocks with "nfkc"; used by replaceArabicPresentationForms.
    static final Transform<String, String> fixArabicPresentation =
            Transliterator.getInstance(
                    "[[:block=Arabic_Presentation_Forms_A:][:block=Arabic_Presentation_Forms_B:]] nfkc");

    /**
     * Normalizes the Arabic presentation forms characters in the specified input.
     *
     * @param value the input to be normalized
     * @return the value after applying the fixArabicPresentation transform
     */
    private String replaceArabicPresentationForms(String value) {
        // All the work is done by the shared fixArabicPresentation transform.
        return fixArabicPresentation.transform(value);
    }

    // Matches a quote character (', U+2019 or U+2018) between two captured character classes;
    // fixAdlamNasalization keeps both captures and replaces the quote with ADLAM_NASALIZATION.
    static Pattern ADLAM_MISNASALIZED = PatternCache.get("([����])['’‘]([����������������])");
    // U+1E94B ADLAM NASALIZATION MARK (Unicode 12.0), written as a surrogate-pair escape so the
    // value does not depend on how this source file's encoding is read.
    public static String ADLAM_NASALIZATION = "\uD83A\uDD4B";

    /**
     * Replace a quote character matched by ADLAM_MISNASALIZED with ADLAM_NASALIZATION, keeping
     * the characters captured on either side of it.
     *
     * @param fromString the string to fix
     * @return the fixed string
     */
    public static String fixAdlamNasalization(String fromString) {
        final Matcher misnasalized = ADLAM_MISNASALIZED.matcher(fromString);
        // $1 and $2 are the characters around the quote; the mark goes where the quote was.
        final String replacement = "$1" + ADLAM_NASALIZATION + "$2";
        return misnasalized.replaceAll(replacement);
    }

    /**
     * Build a cleaned copy of an exemplar set and format it with {@code rawFormatter}.
     *
     * <p>For each item: a hyphen, space or narrow space also adds its breaking/non-breaking
     * counterpart; the item is lower-cased when the exemplar type asks for it; and its composed
     * form is added when that differs. Finally the characters the exemplar type lists in {@code
     * toRemove} are removed.
     *
     * @param exemplar the set to clean; must not be null
     * @param exemplarType the exemplar type, or null if none applies
     * @return the formatted cleaned set
     * @throws IllegalArgumentException if {@code rawFormatter} or {@code exemplar} is null
     */
    public String getCleanedUnicodeSet(UnicodeSet exemplar, ExemplarType exemplarType) {
        if (rawFormatter == null) {
            throw new IllegalArgumentException("Formatter must not be null");
        }
        if (exemplar == null) {
            throw new IllegalArgumentException("set to be cleaned must not be null");
        }

        final UnicodeSet cleaned = new UnicodeSet();
        final UnicodeSetIterator iterator = new UnicodeSetIterator(exemplar);
        while (iterator.next()) {
            String item = iterator.getString();
            if (item.equals("ß") || item.equals("İ")) {
                // These two are added exactly as given, bypassing the steps below.
                cleaned.add(item);
                continue;
            }

            // Pair each hyphen/space with its breaking or non-breaking counterpart.
            final String counterpart;
            switch (item) {
                case "\u2011": // nobreak hyphen
                    counterpart = "-";
                    break;
                case "-":
                    counterpart = "\u2011";
                    break;
                case " ":
                    counterpart = "\u00a0"; // nobreak space
                    break;
                case "\u00a0":
                    counterpart = " ";
                    break;
                case "\u202F": // nobreak narrow space
                    counterpart = "\u2009";
                    break;
                case "\u2009":
                    counterpart = "\u202F";
                    break;
                default:
                    counterpart = null;
                    break;
            }
            if (counterpart != null) {
                cleaned.add(counterpart);
            }

            if (exemplarType != null && exemplarType.convertUppercase) {
                item = UCharacter.toLowerCase(ULocale.ENGLISH, item);
            }
            cleaned.add(item);

            // Also add the composed form when it differs from the item itself.
            final String composed = Normalizer.compose(item, false);
            if (!item.equals(composed)) {
                cleaned.add(composed);
            }
        }

        if (exemplarType != null) {
            cleaned.removeAll(exemplarType.toRemove);
        }
        return rawFormatter.format(cleaned);
    }

    // Splits on ';' and trims each piece; getCanonicalPattern uses it to separate subpatterns.
    static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults();

    /**
     * @return a canonical numeric pattern, based on the type, and the isPOSIX flag. The latter is
     *     set for en_US_POSIX.
     */
    public static String getCanonicalPattern(String inpattern, NumericType type, boolean isPOSIX) {
        // TODO fix later to properly handle quoted ;

        // The format is constructed for every input, including those returned unchanged below.
        final DecimalFormat format = new DecimalFormat(inpattern);

        final boolean leaveUntouched =
                type == NumericType.DECIMAL_ABBREVIATED
                        || type == NumericType.CURRENCY_ABBREVIATED
                        || CldrUtility.INHERITANCE_MARKER.equals(inpattern);
        if (leaveUntouched) {
            return inpattern; // TODO fix when ICU bug is fixed
        }

        // digits = {minimum integer digits, minimum fraction digits, maximum fraction digits}
        final int[] digits = isPOSIX ? type.posixDigitCount : type.digitCount;
        format.setMinimumIntegerDigits(digits[0]);
        format.setMinimumFractionDigits(digits[1]);
        format.setMaximumFractionDigits(digits[2]);

        // Rebuild the pattern from at most its first two ';'-separated parts, each trimmed.
        final List<String> parts = SEMI_SPLITTER.splitToList(format.toPattern());
        String canonical = parts.get(0);
        if (parts.size() > 1) {
            canonical += ";" + parts.get(1);
        }
        return canonical;
    }

    /**
     * Enable inheritance replacement by supplying the file that replaceBaileyWithInheritanceMarker
     * will query for Bailey values.
     *
     * @param cldrFile the file to use for Bailey value lookups
     */
    public void enableInheritanceReplacement(CLDRFile cldrFile) {
        this.cldrFileForBailey = cldrFile;
    }

    /*
     * This tests what type a numeric pattern is.
     */
    public enum NumericType {
        // Each int[] is {minimum integer digits, minimum fraction digits, maximum fraction
        // digits}: the first array for ordinary locales, the second for POSIX.
        CURRENCY(new int[] {1, 2, 2}, new int[] {1, 2, 2}),
        CURRENCY_ABBREVIATED(),
        DECIMAL(new int[] {1, 0, 3}, new int[] {1, 0, 6}),
        DECIMAL_ABBREVIATED(),
        PERCENT(new int[] {1, 0, 0}, new int[] {1, 0, 0}),
        SCIENTIFIC(new int[] {0, 0, 0}, new int[] {1, 6, 6}),
        NOT_NUMERIC;

        // Group 1 is the whole element prefix; group 2 is the lower-case type name for the
        // "...Formats" alternatives and is unset for "currencies/currency".
        private static final Pattern NUMBER_PATH =
                Pattern.compile(
                        "//ldml/numbers/((currency|decimal|percent|scientific)Formats|currencies/currency).*");
        private int[] digitCount;
        private int[] posixDigitCount;

        /** For types without digit counts; both arrays stay null. */
        NumericType() {}

        NumericType(int[] digitCount, int[] posixDigitCount) {
            this.digitCount = digitCount;
            this.posixDigitCount = posixDigitCount;
        }

        /**
         * @param xpath the path to classify
         * @return the numeric type of the xpath, or NOT_NUMERIC if the path has no "/pattern"
         *     or is not a recognized number path
         * @throws IllegalArgumentException if a percent or scientific path contains
         *     {@code ="1000}, for which there is no abbreviated type
         */
        public static NumericType getNumericType(String xpath) {
            if (!xpath.contains("/pattern")) {
                return NOT_NUMERIC;
            }
            Matcher matcher = NUMBER_PATH.matcher(xpath);
            if (!matcher.matches()) {
                return NOT_NUMERIC;
            }
            if (matcher.group(1).equals("currencies/currency")) {
                return CURRENCY;
            }
            // Locale.ROOT keeps the constant lookup independent of the JVM default locale; with
            // a Turkish default, "decimal" would upper-case with a dotted capital I and
            // valueOf would throw.
            NumericType type = NumericType.valueOf(matcher.group(2).toUpperCase(Locale.ROOT));
            if (xpath.contains("=\"1000")) {
                // An attribute value starting with 1000 marks an abbreviated (compact) pattern.
                if (type == DECIMAL) {
                    type = DECIMAL_ABBREVIATED;
                } else if (type == CURRENCY) {
                    type = CURRENCY_ABBREVIATED;
                } else {
                    throw new IllegalArgumentException("Internal Error");
                }
            }
            return type;
        }

        /**
         * @return the digit counts for ordinary locales, or null for types that have none; the
         *     array is the internal one, not a copy
         */
        public int[] getDigitCount() {
            return digitCount;
        }

        /**
         * @return the digit counts for POSIX, or null for types that have none; the array is the
         *     internal one, not a copy
         */
        public int[] getPosixDigitCount() {
            return posixDigitCount;
        }
    }

    /**
     * Turn all whitespace sequences (including tab and newline, and NBSP for certain paths) into a
     * single space or a single NBSP depending on path. Also trim initial/final NBSP, unless the
     * value is only the one character, "\u00A0"
     *
     * @param path
     * @param value
     * @return the normalized value
     */
    private String normalizeWhitespace(String path, String value) {
        // Step 1: replace matches of the whitespace patterns with the one space character that
        // the path's PathSpaceType calls for. The three NBSP-like branches also trim the ends.
        PathSpaceType pst = PathSpaceType.get(path);
        if (pst == PathSpaceType.allowSp) {
            value =
                    WHITESPACE_AND_NBSP_TO_NORMALIZE
                            .matcher(value)
                            .replaceAll(" "); // replace with regular space
        } else if (pst == PathSpaceType.allowNbsp) {
            value =
                    WHITESPACE_AND_NBSP_TO_NORMALIZE
                            .matcher(value)
                            .replaceAll("\u00A0"); // replace with NBSP
            value = trimNBSP(value);
        } else if (pst == PathSpaceType.allowNNbsp) {
            value =
                    WHITESPACE_AND_NBSP_TO_NORMALIZE
                            .matcher(value)
                            .replaceAll("\u202F"); // replace with NNBSP
            value = trimNBSP(value);
        } else if (pst == PathSpaceType.allowSpOrNbsp) {
            /*
             * in this case don't normalize away NBSP
             */
            value =
                    WHITESPACE_NO_NBSP_TO_NORMALIZE
                            .matcher(value)
                            .replaceAll(" "); // replace with regular space
            /*
             * if any NBSP and regular space are adjacent, replace with NBSP
             */
            value = NBSP_PLUS_SPACE_TO_NORMALIZE.matcher(value).replaceAll("\u00A0");
            value = SPACE_PLUS_NBSP_TO_NORMALIZE.matcher(value).replaceAll("\u00A0");
            value = MULTIPLE_NBSP.matcher(value).replaceAll("\u00A0");
            value = trimNBSP(value);
        } else {
            // Also reached if pst is null.
            throw new IllegalArgumentException("Unknown PathSpaceType " + pst);
        }

        // Step 2: script- and path-specific adjustments.
        // Further whitespace adjustments per CLDR-14032
        if ((scriptCode.equals("Latn") || scriptCode.equals("Cyrl") || scriptCode.equals("Grek"))
                && HOUR_FORMAT_XPATHS.matcher(path).matches()) {
            // "test" is a trial replacement used only to measure how many characters the
            // AMPM patterns would remove; a difference of exactly 4 suppresses the real one.
            String test = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1$2"); // value without a+
            // Paths containing "ascii" get a regular space; all others get NNBSP (U+202F).
            String spaceReplace = path.contains("ascii") ? "$1\u0020$3" : "$1\u202F$3";
            if (value.length() - test.length() != 4) { // exclude patterns with aaaa
                value = AMPM_SPACE_BEFORE.matcher(value).replaceAll(spaceReplace);
            }
            test = AMPM_SPACE_AFTER.matcher(value).replaceAll("$2$3"); // value without a+
            if (value.length() - test.length() != 4) { // exclude patterns with aaaa
                value = AMPM_SPACE_AFTER.matcher(value).replaceAll(spaceReplace);
            }
        }
        if (scriptCode.equals("Cyrl") && YEAR_FORMAT_XPATHS.matcher(path).matches()) {
            // Write "y", NNBSP, then whatever YEAR_SPACE_YEARMARKER captured as group 1.
            value = YEAR_SPACE_YEARMARKER.matcher(value).replaceAll("y\u202F$1");
        }
        if (UNIT_NARROW_XPATHS.matcher(path).matches()) {
            value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u202F"); // Narrow NBSP
            value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u202F{");
        }
        if (UNIT_SHORT_XPATHS.matcher(path).matches()) {
            value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u00A0"); // Regular NBSP
            value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u00A0{");
        }

        // Step 3: the order below sets the precedence NNBSP, then NBSP, then THIN SPACE.
        // Finally, replace remaining space combinations with most restrictive type CLDR-17233
        // If we have NNBSP U+202F in combination with other spaces, keep just it
        value = NNBSP_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u202F");
        // Else if we have NBSP U+00A0 in combination with other spaces, keep just it
        value = NBSP_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u00A0");
        // Else if we have THIN SPACE U+2009 in combination with other spaces, keep just it
        value = THIN_SPACE_AMONG_OTHER_SPACES.matcher(value).replaceAll("\u2009");

        return value;
    }

    /**
     * Delete any initial or final NBSP or NNBSP, unless the value is just NBSP or NNBSP
     *
     * @param value
     * @return the trimmed value
     */
    private String trimNBSP(String value) {
        // A value consisting solely of one NBSP or one NNBSP is left alone.
        if (value.equals("\u00A0") || value.equals("\u202F")) {
            return value;
        }
        final String withoutInitial = INITIAL_NBSP.matcher(value).replaceAll("");
        return FINAL_NBSP.matcher(withoutInitial).replaceAll("");
    }

    /**
     * Categorize xpaths according to the kind of space they allow: regular space, NBSP, NNBSP, or
     * either regular space or NBSP.
     */
    public enum PathSpaceType {
        allowSp,
        allowNbsp,
        allowNNbsp,
        allowSpOrNbsp;

        /**
         * Classify a path. The checks run in a fixed order (regular space, NBSP, NNBSP), so a
         * path that satisfies several of them gets the first matching category.
         *
         * @param path the xpath to classify
         * @return the category; allowSpOrNbsp when no specific rule matches
         */
        public static PathSpaceType get(String path) {
            if (wantsRegularSpace(path)) {
                return allowSp;
            }
            if (wantsNBSP(path)) {
                return allowNbsp;
            }
            return wantsNNBSP(path) ? allowNNbsp : allowSpOrNbsp;
        }

        /** True for paths whose values should use only the regular space. */
        private static boolean wantsRegularSpace(String path) {
            if (path.contains("/dateFormatLength") && path.contains("/pattern")) {
                return true;
            }
            if (path.contains("/availableFormats/dateFormatItem")) {
                return true;
            }
            if (path.startsWith("//ldml/dates/timeZoneNames/metazone")
                    && path.contains("/long")) {
                return true;
            }
            if (path.startsWith("//ldml/dates/timeZoneNames/regionFormat")
                    || path.startsWith("//ldml/localeDisplayNames/codePatterns/codePattern")
                    || path.startsWith("//ldml/localeDisplayNames/languages/language")
                    || path.startsWith("//ldml/localeDisplayNames/territories/territory")
                    || path.startsWith("//ldml/localeDisplayNames/types/type")
                    || path.startsWith("//ldml/posix/messages")) {
                return true;
            }
            if (path.startsWith("//ldml/numbers/currencies/currency")
                    && path.contains("/displayName")) {
                return true;
            }
            if (path.contains("/decimalFormatLength[@type=\"long\"]")
                    && path.contains("/pattern")) {
                return true;
            }
            // NOTE(review): the literal below ends with a space, so it only matches a path
            // that has a space right after "/unitPattern" - confirm this is intended.
            return path.startsWith("//ldml/units/uni") && path.contains("/unitPattern ");
        }

        /** True for paths whose values should use NBSP. */
        private static boolean wantsNBSP(String path) {
            final boolean hasPattern = path.contains("/pattern");
            final boolean hasGroup = path.contains("/group");
            if (path.contains("/currencies/currency") && (hasGroup || hasPattern)) {
                return true;
            }
            // For decimalFormatLength this is reached only by the non-long ones, since get()
            // asks wantsRegularSpace first.
            if (hasPattern
                    && (path.contains("/currencyFormatLength")
                            || path.contains("/decimalFormatLength")
                            || path.contains("/percentFormatLength"))) {
                return true;
            }
            if (path.contains("/currencySpacing") && path.contains("/insertBetween")) {
                return true;
            }
            return path.startsWith("//ldml/numbers/symbols")
                    && (hasGroup || path.contains("/nan"));
        }

        /** True for abbreviated or narrow am/pm day period paths, which should use NNBSP. */
        private static boolean wantsNNBSP(String path) {
            final boolean abbreviatedOrNarrow =
                    path.contains("/dayPeriodWidth[@type=\"abbreviated\"]")
                            || path.contains("/dayPeriodWidth[@type=\"narrow\"]");
            final boolean amOrPm =
                    path.contains("/dayPeriod[@type=\"am\"]")
                            || path.contains("/dayPeriod[@type=\"pm\"]");
            return abbreviatedOrNarrow && amOrPm;
        }
    }

    // One or more consecutive U+200B ZERO WIDTH SPACE characters.
    private static final Pattern ZERO_WIDTH_SPACES = PatternCache.get("\\u200B+");
    // Locale IDs for which normalizeZeroWidthSpace removes every ZERO WIDTH SPACE.
    private static final Set<String> LOCALES_NOT_ALLOWING_ZWS =
            new HashSet<>(Arrays.asList("da", "fr"));

    /**
     * Remove occurrences of U+200B ZERO_WIDTH_SPACE under certain conditions
     *
     * @param value the value to be normalized
     * @return the normalized value
     *     <p>TODO: extend this method to address more concerns, after clarifying the conditions -
     *     enlarge the set LOCALES_NOT_ALLOWING_ZWS? - strip initial and final ZWS in all locales? -
     *     reduce two or more adjacent ZWS to one ZWS? - allow or prohibit ZWS by itself as currency
     *     symbol, as currently in locales kea, pt_CV, pt_PT - allow or prohibit ZWS preceding URL
     *     as in "as per [U+200B]https://www.unicode.org/reports/tr35/tr35-general.html#Annotations
     *     " Reference: https://unicode-org.atlassian.net/browse/CLDR-15976
     */
    private String normalizeZeroWidthSpace(String value) {
        // The locale is only consulted when the value actually contains a zero width space.
        if (!ZERO_WIDTH_SPACES.matcher(value).find()) {
            return value;
        }
        final String localeId = locale.getBaseName();
        if (!LOCALES_NOT_ALLOWING_ZWS.contains(localeId)) {
            return value;
        }
        return ZERO_WIDTH_SPACES.matcher(value).replaceAll("");
    }

    /**
     * If inheritance replacement is enabled and the value matches the Bailey (inherited) value,
     * replace the value with CldrUtility.INHERITANCE_MARKER
     *
     * <p>This is only appropriate if cldrFileForBailey != null, meaning that
     * enableInheritanceReplacement has been called -- some cost may be involved in getting
     * cldrFileForBailey and calling getBaileyValue, and some callers of DAIP may not want the
     * replacement, so the default, when enableInheritanceReplacement has not been called, is no
     * replacement
     *
     * @param path
     * @param value
     * @return the value or CldrUtility.INHERITANCE_MARKER
     */
    public String replaceBaileyWithInheritanceMarker(String path, String value) {
        // No replacement unless it was enabled, and never for an empty value.
        if (cldrFileForBailey == null || value.isEmpty()) {
            return value;
        }
        final Output<String> pathWhereFound = new Output<>();
        final Output<String> localeWhereFound = new Output<>();
        final String baileyValue =
                cldrFileForBailey.getBaileyValue(path, pathWhereFound, localeWhereFound);
        if (!value.equals(baileyValue)) {
            return value;
        }
        // A Bailey value found in root or in code fallback does not get the marker.
        final String foundIn = localeWhereFound.value;
        final boolean fromRootOrCodeFallback =
                XMLSource.ROOT_ID.equals(foundIn) || XMLSource.CODE_FALLBACK_ID.equals(foundIn);
        return fromRootOrCodeFallback ? value : CldrUtility.INHERITANCE_MARKER;
    }
}