xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/CLDRFileTransformer.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.ibm.icu.text.Normalizer;
4 import com.ibm.icu.text.Transliterator;
5 import com.ibm.icu.text.UnicodeSet;
6 import com.ibm.icu.util.ICUUncheckedIOException;
7 import java.io.File;
8 import java.util.Map;
9 import java.util.concurrent.ConcurrentHashMap;
10 import org.unicode.cldr.test.DisplayAndInputProcessor;
11 import org.unicode.cldr.util.CLDRFile;
12 import org.unicode.cldr.util.CLDRPaths;
13 import org.unicode.cldr.util.CLDRTransforms;
14 import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
15 import org.unicode.cldr.util.CldrUtility;
16 import org.unicode.cldr.util.DtdType;
17 import org.unicode.cldr.util.Factory;
18 import org.unicode.cldr.util.LocaleIDParser;
19 import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
20 import org.unicode.cldr.util.SimpleXMLSource;
21 import org.unicode.cldr.util.TempPrintWriter;
22 import org.unicode.cldr.util.XMLSource;
23 
24 /**
25  * Transforms the contents of a CLDRFile.
26  *
27  * @author jchye
28  */
29 public class CLDRFileTransformer {
30     public enum PolicyIfExisting {
31         RETAIN, // Do not transliterate if existing output has locale content
32         DISCARD, // Replace existing output locale content
33         MINIMIZE // RETAIN, plus drop values if translit is a no-op.
34     }
35 
36     /**
37      * Contains all supported locale-to-locale conversions along with information needed to convert
38      * each locale. Each enum value is named after the locale that results from the conversion.
39      */
40     public enum LocaleTransform {
41         sr_Latn(
42                 "sr",
43                 "Serbian-Latin-BGN.xml",
44                 Transliterator.FORWARD,
45                 "[:script=Cyrl:]",
46                 PolicyIfExisting.DISCARD), //
47         sr_Latn_BA(
48                 "sr_Cyrl_BA",
49                 "Serbian-Latin-BGN.xml",
50                 Transliterator.FORWARD,
51                 "[:script=Cyrl:]",
52                 PolicyIfExisting.DISCARD), //
53         sr_Latn_ME(
54                 "sr_Cyrl_ME",
55                 "Serbian-Latin-BGN.xml",
56                 Transliterator.FORWARD,
57                 "[:script=Cyrl:]",
58                 PolicyIfExisting.DISCARD), //
59         sr_Latn_XK(
60                 "sr_Cyrl_XK",
61                 "Serbian-Latin-BGN.xml",
62                 Transliterator.FORWARD,
63                 "[:script=Cyrl:]",
64                 PolicyIfExisting.DISCARD), //
65         ha_NE(
66                 "ha",
67                 "ha-ha_NE.xml",
68                 Transliterator.FORWARD,
69                 "[y Y ƴ Ƴ ʼ]",
70                 PolicyIfExisting.DISCARD), //
71         yo_BJ(
72                 "yo",
73                 "yo-yo_BJ.xml",
74                 Transliterator.FORWARD,
75                 "[ẹ ọ ṣ Ẹ Ọ Ṣ]",
76                 PolicyIfExisting.DISCARD), //
77         de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), //
78         yue_Hans(
79                 "yue",
80                 "Simplified-Traditional.xml",
81                 Transliterator.REVERSE,
82                 "[:script=Hant:]",
83                 PolicyIfExisting.RETAIN), //
84     // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
85     // Needs work to fix currency symbols, handle Māori. See
86     // http://unicode.org/cldr/trac/ticket/9516#comment:6
87     ;
88 
89         private final String inputLocale;
90         private final String transformFilename;
91         private final int direction;
92         private final UnicodeSet inputChars;
93         private final PolicyIfExisting policy;
94 
95         /**
96          * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)}
97          *     instead
98          */
99         @Deprecated
LocaleTransform( String inputLocale, String transformFilename, int direction, String inputCharPattern)100         private LocaleTransform(
101                 String inputLocale,
102                 String transformFilename,
103                 int direction,
104                 String inputCharPattern) {
105             this(
106                     inputLocale,
107                     transformFilename,
108                     direction,
109                     inputCharPattern,
110                     PolicyIfExisting.DISCARD);
111         }
112 
LocaleTransform( String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy)113         private LocaleTransform(
114                 String inputLocale,
115                 String transformFilename,
116                 int direction,
117                 String inputCharPattern,
118                 PolicyIfExisting policy) {
119             this.inputLocale = inputLocale;
120             this.transformFilename = transformFilename;
121             this.direction = direction;
122             this.inputChars = new UnicodeSet(inputCharPattern);
123             this.policy = policy;
124         }
125 
126         /**
127          * @return the policy for existing content
128          */
getPolicyIfExisting()129         public PolicyIfExisting getPolicyIfExisting() {
130             return policy;
131         }
132 
133         /**
134          * @return the locale that used for conversion
135          */
getInputLocale()136         public String getInputLocale() {
137             return inputLocale;
138         }
139 
140         /**
141          * @return the locale that used for conversion
142          */
getOutputLocale()143         public String getOutputLocale() {
144             return this.toString();
145         }
146 
147         /**
148          * @return the filename of the transform used to make the conversion
149          */
getTransformFilename()150         public String getTransformFilename() {
151             return transformFilename;
152         }
153 
154         /**
155          * @return the direction of the transformation
156          */
getDirection()157         public int getDirection() {
158             return direction;
159         }
160 
161         /**
162          * @return the set of characters in the input locale that should have been removed after
163          *     transformation, used for internal debugging
164          */
getInputChars()165         private UnicodeSet getInputChars() {
166             return inputChars;
167         }
168     }
169 
170     private UnicodeSet unconverted = new UnicodeSet();
171     private Factory factory;
172     /*
173      * The transliterators map exists, and is static, to avoid wasting a lot of time creating
174      * a new Transliterator more often than necessary. (An alternative to "static" here might be to
175      * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.)
176      * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems.
177      * Reference: https://unicode.org/cldr/trac/ticket/11657
178      */
179     private static Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<>();
180     private String transformDir;
181 
182     /**
183      * @param factory the factory to get locale data from
184      * @param transformDir the directory containing the transform files
185      */
CLDRFileTransformer(Factory factory, String transformDir)186     public CLDRFileTransformer(Factory factory, String transformDir) {
187         this.factory = factory;
188         this.transformDir = transformDir;
189     }
190 
loadTransliterator(LocaleTransform localeTransform)191     public Transliterator loadTransliterator(LocaleTransform localeTransform) {
192         if (transliterators.containsKey(localeTransform)) {
193             return transliterators.get(localeTransform);
194         }
195         Transliterator transliterator;
196         if (localeTransform.getTransformFilename().contains(".xml")) {
197             ParsedTransformID directionInfo = new ParsedTransformID();
198             String ruleString =
199                     CLDRTransforms.getIcuRulesFromXmlFile(
200                             transformDir, localeTransform.getTransformFilename(), directionInfo);
201             transliterator =
202                     Transliterator.createFromRules(
203                             directionInfo.getId(), ruleString, localeTransform.getDirection());
204         } else {
205             transliterator = Transliterator.getInstance(localeTransform.getTransformFilename());
206         }
207         transliterators.put(localeTransform, transliterator);
208         return transliterator;
209     }
210 
211     /**
212      * NOTE: This method does not currently handle nested transliterators.
213      *
214      * @param input
215      * @return null if the input file was missing, or if there is no new output file.
216      */
transform(LocaleTransform localeTransform)217     public CLDRFile transform(LocaleTransform localeTransform) {
218         Transliterator transliterator = loadTransliterator(localeTransform);
219         CLDRFile input;
220         final String inputLocale = localeTransform.getInputLocale();
221         try {
222             input = factory.make(inputLocale, false);
223         } catch (ICUUncheckedIOException e1) {
224             return null; // input file is missing (or otherwise unavailable)
225         }
226         boolean hadOutput = true;
227         CLDRFile output;
228         try {
229             output = factory.make(localeTransform.getOutputLocale(), false);
230         } catch (NoSourceDirectoryException e) {
231             // if we can't open the file, then just make a new one.
232             XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale());
233             output = new CLDRFile(dataSource);
234             hadOutput = false;
235         }
236         String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale());
237         CLDRFile outputParent = factory.make(outputParentString, true);
238 
239         outputParent = factory.make(inputLocale, false);
240         XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
241         DisplayAndInputProcessor daip = new DisplayAndInputProcessor(output, true);
242         for (String xpath : input) {
243             String value = input.getStringValue(xpath);
244             if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
245                 final String foundIn = input.getSourceLocaleID(xpath, null);
246                 // Include these only when they are actually present in this file
247                 if (!foundIn.equals(inputLocale)) {
248                     // inheritance marker came from somewhere else, ignore it
249                     continue;
250                 }
251             }
252             if (value == null) {
253                 continue;
254             }
255             String fullPath = input.getFullXPath(xpath);
256             String oldValue = output.getStringValue(xpath);
257             String parentValue = outputParent.getStringValue(xpath);
258             value =
259                     transformValue(
260                             transliterator, localeTransform, xpath, value, oldValue, parentValue);
261             if (value != null) {
262                 // check again
263                 if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
264                     final String foundIn = input.getSourceLocaleID(xpath, null);
265                     // Include these only when they are actually present in this file
266                     if (!foundIn.equals(inputLocale)) {
267                         // inheritance marker came from somewhere else, ignore it
268                         continue;
269                     }
270                 }
271                 value = daip.processInput(xpath, value, null);
272                 outputSource.putValueAtPath(fullPath, value);
273             }
274         }
275         if (!outputSource.iterator().hasNext()) { // empty new output
276             if (!hadOutput) {
277                 return null; // don't add file if nothing to add
278             }
279         }
280         return new CLDRFile(outputSource);
281     }
282 
283     /**
284      * Transforms a CLDRFile value into another form.
285      *
286      * @param parentValue
287      */
transformValue( Transliterator transliterator, LocaleTransform localeTransform, String path, String value, String oldValue, String parentValue)288     private String transformValue(
289             Transliterator transliterator,
290             LocaleTransform localeTransform,
291             String path,
292             String value,
293             String oldValue,
294             String parentValue) {
295 
296         // allows us to change only new values
297         switch (localeTransform.policy) {
298             case RETAIN:
299             case MINIMIZE:
300                 if (oldValue != null) {
301                     return oldValue;
302                 }
303                 break;
304             default:
305         }
306 
307         UnicodeSet chars = localeTransform.getInputChars();
308         String transliterated;
309 
310         // TODO: Don't transform dates/patterns.
311         // For now, don't try to transliterate the exemplar characters - use the ones from the
312         // original locale.
313         // In the future, we can probably control this better with a config file - similar to
314         // CLDRModify's config file.
315         if (path.contains("exemplarCharacters")) {
316             if (oldValue != null) {
317                 transliterated = oldValue;
318             } else {
319                 transliterated = value;
320             }
321         } else {
322             transliterated = transliterator.transliterate(value);
323             transliterated = Normalizer.compose(transliterated, false);
324         }
325         if (localeTransform.policy == PolicyIfExisting.MINIMIZE) {
326             if (transliterated.equals(value)) {
327                 return null;
328             }
329         }
330 
331         if (chars.containsSome(transliterated)) {
332             unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
333         }
334         return transliterated;
335     }
336 
main(String[] args)337     public static void main(String[] args) throws Exception {
338         for (String dir : DtdType.ldml.directories) {
339             if (dir.equals("casing") // skip, field contents are keywords, not localizable content
340                     || dir.equals(
341                             "collation") // skip, field contents are complex, and can't be simply
342                     // remapped
343                     || dir.equals("annotationsDerived") // skip, derived later
344             ) {
345                 continue;
346             }
347             System.out.println("\nDirectory: " + dir);
348             final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/";
349             Factory factory = Factory.make(sourceDirectory, ".*");
350 
351             CLDRFileTransformer transformer =
352                     new CLDRFileTransformer(
353                             factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
354             for (LocaleTransform localeTransform : LocaleTransform.values()) {
355                 CLDRFile output = transformer.transform(localeTransform);
356                 if (output == null) {
357                     System.out.println(
358                             "SKIPPING missing file: "
359                                     + dir
360                                     + "/"
361                                     + localeTransform.inputLocale
362                                     + ".xml");
363                     continue;
364                 }
365                 String outputFile = output.getLocaleID() + ".xml";
366                 try (TempPrintWriter out =
367                         TempPrintWriter.openUTF8Writer(sourceDirectory, outputFile)
368                                 .skipCopyright(true)) {
369                     // System.out.println("Generating locale file: " + outputDir + outputFile);
370                     if (!transformer.unconverted.isEmpty()) {
371                         System.out.println("Untransformed characters: " + transformer.unconverted);
372                         transformer.unconverted.clear();
373                     }
374                     output.write(out.asPrintWriter());
375                 }
376             }
377         }
378     }
379 }
380