xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertXTB.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.FileNotFoundException;
5 import java.io.IOException;
6 import java.io.InputStream;
7 import java.io.PrintStream;
8 import java.io.PrintWriter;
9 import java.util.ArrayList;
10 import java.util.HashMap;
11 import java.util.HashSet;
12 import java.util.Iterator;
13 import java.util.List;
14 import java.util.Map;
15 import java.util.Set;
16 import java.util.regex.Matcher;
17 import java.util.regex.Pattern;
18 import org.unicode.cldr.test.CheckCLDR;
19 import org.unicode.cldr.test.CheckCLDR.CheckStatus;
20 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Type;
21 import org.unicode.cldr.test.DisplayAndInputProcessor;
22 import org.unicode.cldr.tool.Option.Options;
23 import org.unicode.cldr.util.CLDRFile;
24 import org.unicode.cldr.util.CLDRLocale;
25 import org.unicode.cldr.util.CLDRPaths;
26 import org.unicode.cldr.util.Factory;
27 import org.unicode.cldr.util.InputStreamFactory;
28 import org.unicode.cldr.util.PathDescription;
29 import org.unicode.cldr.util.PatternCache;
30 import org.unicode.cldr.util.PatternPlaceholders;
31 import org.unicode.cldr.util.PatternPlaceholders.PlaceholderInfo;
32 import org.unicode.cldr.util.SimpleXMLSource;
33 import org.unicode.cldr.util.SupplementalDataInfo;
34 import org.unicode.cldr.util.XMLFileReader;
35 import org.unicode.cldr.util.XMLSource;
36 import org.xml.sax.Attributes;
37 import org.xml.sax.ContentHandler;
38 import org.xml.sax.InputSource;
39 import org.xml.sax.Locator;
40 import org.xml.sax.SAXException;
41 import org.xml.sax.XMLReader;
42 
43 /**
44  * A command-line tool for converting XTB files to the CLDR format and checking them against current
45  * CLDR data.
46  *
47  * @author [email protected] (Jennifer Chye)
48  */
49 public class ConvertXTB {
    /** Captures the numeric message ID from an {@code [@id="..."]} attribute inside a path. */
    private static final Pattern ID_PATTERN = PatternCache.get("\\[@id=\"(\\d++)\"]");
    /**
     * Recognizes a plural message such as {@code {NUMBER,plural, one{...}other{...}}}. Group 1 is
     * everything between {@code "plural, "} and the final closing brace.
     */
    private static final Pattern PLURAL_MESSAGE_FORMAT =
            PatternCache.get("\\{[A-Z_]++,plural, (.*)}");

    // The three static fields below are created lazily on first use.
    private static PatternPlaceholders patternPlaceholders;
    // Cache: xpath -> (placeholder name -> placeholder such as "{0}").
    private static Map<String, Map<String, String>> loadedReverseTagMaps;
    private static PathDescription pathDescription;

    private Factory factory;
    // The "xtb" subdirectory of the input directory.
    private File xtbDir;
    private File inputDir;
    private String outputDir;
    private CheckCLDR checkCldr;
    private CLDRFile englishFile;
    // Destination for the tab-separated check results; System.out unless replaced.
    private PrintStream out;
65 
66     /***
67      * Constructor. The input directory must have the following format:
68      *
69      * <pre>
70      * inputDir/
71      *   ar/
72      *     ar.wsb
73      *   ca/
74      *     ca.wsb
75      *   ...
76      *   xtb/
77      *     ar.xtb
78      *     ca.xtb
79      *     ...
80      * </pre>
81      *
82      * @param inputDir
83      *            the directory to read the wsb and xtb files from
84      * @param outputDir
85      *            the directory to write the generated CLDR files to
86      * @param checkFilter
87      *            the CheckCLDR regex filter for checking
88      */
ConvertXTB(String inputDir, String outputDir, String checkFilter)89     private ConvertXTB(String inputDir, String outputDir, String checkFilter) {
90         xtbDir = new File(inputDir, "xtb");
91         this.inputDir = new File(inputDir);
92         this.outputDir = outputDir;
93         factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*");
94         englishFile = factory.make("en", true);
95         this.checkCldr = CheckCLDR.getCheckAll(factory, checkFilter);
96         CheckCLDR.setDisplayInformation(englishFile);
97         out = System.out;
98     }
99 
100     /**
101      * Sets the PrintStream that any errors will be sent to. System.out is used by default.
102      *
103      * @param out
104      */
setErrorOutput(PrintStream out)105     private void setErrorOutput(PrintStream out) {
106         this.out = out;
107     }
108 
109     /** Wrapper class for the contents of an XTB file. */
110     private class XtbInfo implements Iterable<XtbEntry> {
111         public String locale;
112         public List<XtbEntry> entries;
113 
XtbInfo(String locale)114         public XtbInfo(String locale) {
115             this.locale = locale;
116             entries = new ArrayList<>();
117         }
118 
119         @Override
iterator()120         public Iterator<XtbEntry> iterator() {
121             return entries.iterator();
122         }
123 
add(XtbEntry entry)124         public void add(XtbEntry entry) {
125             entries.add(entry);
126         }
127     }
128 
129     /** Wrapper class for information related to a &lt;translation&gt; node in an XTB file. */
130     private class XtbEntry {
131         public String messageId;
132         public String xpath;
133         public String value;
134 
XtbEntry(String messageId, String xpath, String value)135         public XtbEntry(String messageId, String xpath, String value) {
136             this.messageId = messageId;
137             this.xpath = xpath;
138             this.value = value;
139         }
140     }
141 
142     /** An XML handler for XTB files. */
143     private class XtbHandler implements ContentHandler {
144         private DisplayAndInputProcessor daip;
145         private StringBuffer currentText;
146         private String lastId;
147         private String lastXpath;
148 
149         private Set<String> orphanedMessages;
150         private Set<String> oldMessages;
151         private XtbInfo output;
152 
XtbHandler(Set<String> oldMessages, XtbInfo output)153         public XtbHandler(Set<String> oldMessages, XtbInfo output) {
154             daip = new DisplayAndInputProcessor(CLDRLocale.getInstance(output.locale));
155             currentText = new StringBuffer();
156             orphanedMessages = new HashSet<>();
157             this.oldMessages = oldMessages;
158             this.output = output;
159         }
160 
161         @Override
characters(char[] ch, int start, int length)162         public void characters(char[] ch, int start, int length) throws SAXException {
163             currentText.append(ch, start, length);
164         }
165 
166         @Override
startElement(String uri, String localName, String qName, Attributes attr)167         public void startElement(String uri, String localName, String qName, Attributes attr)
168                 throws SAXException {
169             if (qName.equals("translation")) {
170                 lastId = attr.getValue("id");
171                 lastXpath = IdToPath.getPath(lastId);
172                 currentText.setLength(0);
173             } else if (qName.equals("ph")) {
174                 String name = attr.getValue("name");
175                 String placeholder = getPlaceholderForName(lastXpath, name);
176                 currentText.append(placeholder);
177             }
178         }
179 
180         @Override
endElement(String uri, String localName, String qName)181         public void endElement(String uri, String localName, String qName) throws SAXException {
182             if (qName.equals("translation")) {
183                 if (lastXpath == null) {
184                     orphanedMessages.add(lastId);
185                 } else if (!oldMessages.contains(lastId)) {
186                     // Only add new values to reduce computation time.
187                     addValue(lastXpath, currentText.toString());
188                 }
189                 currentText.setLength(0);
190             }
191         }
192 
193         /**
194          * Add the specified xpath and value to the output.
195          *
196          * @param xpath
197          * @param value
198          */
addValue(String xpath, String value)199         private void addValue(String xpath, String value) {
200             Matcher matcher = PLURAL_MESSAGE_FORMAT.matcher(value);
201             if (matcher.matches()) {
202                 // Parse the plural value. Example plural value:
203                 // {NUMBER,plural, =0{0 {CURRENCY_NAME}}=1{1 {CURRENCY_NAME}}
204                 // one{# {CURRENCY_NAME}}other{# {CURRENCY_NAME}}}
205                 addPluralValue(xpath, matcher.group(1));
206             } else {
207                 addValueToOutput(xpath, value);
208             }
209         }
210 
211         /**
212          * Processes a plural value and xpath and adds them to the output.
213          *
214          * @param xpath
215          * @param value
216          */
addPluralValue(String xpath, String value)217         private void addPluralValue(String xpath, String value) {
218             // Example plural value to be parsed:
219             // =0{0 {CURRENCY_NAME}}=1{1 {CURRENCY_NAME}}
220             // one{# {CURRENCY_NAME}}other{# {CURRENCY_NAME}}
221             int numOpen = 0;
222             StringBuffer buffer = new StringBuffer();
223             String countType = null;
224             int nameStart = -1;
225             for (int i = 0; i < value.length(); i++) {
226                 char c = value.charAt(i);
227                 switch (c) {
228                     case '{':
229                         if (numOpen == 0) {
230                             int startIndex = buffer.charAt(0) == '=' ? 1 : 0;
231                             countType = buffer.substring(startIndex);
232                             buffer.setLength(0);
233                         } else {
234                             // Start of placeholder.
235                             nameStart = i + 1;
236                         }
237                         numOpen++;
238                         break;
239                     case '}':
240                         numOpen--;
241                         if (numOpen == 0) {
242                             // Special handling for decimal format lengths.
243                             if (lastXpath.contains("decimalFormatLength")) {
244                                 if (countType.length() == 1) {
245                                     countType = countType.charAt(0) == '1' ? "one" : "zero";
246                                 } else if (countType.equals("one")) {
247                                     // skip, contains rubbish
248                                     buffer.setLength(0);
249                                     countType = null;
250                                     break;
251                                 }
252                             }
253                             // Add the count attribute back to the xpath.
254                             String pluralXPath = xpath + "[@count=\"" + countType + "\"]";
255                             // Add any remaining missing placeholders.
256                             String pluralValue = buffer.toString();
257                             if (pluralValue.contains("{1}") && !pluralValue.contains("{0}")) {
258                                 // Fix placeholder numbering. Assumes there is only one
259                                 // placeholder in the pattern.
260                                 if (countType.matches("[01]")) {
261                                     pluralValue =
262                                             pluralValue.replaceAll(countType + "(?!})", "{0}");
263                                 } else {
264                                     pluralValue = pluralValue.replace("{1}", "{0}");
265                                 }
266                             }
267                             addValueToOutput(pluralXPath, pluralValue);
268                             buffer.setLength(0);
269                             countType = null;
270                         } else {
271                             // End of placeholder.
272                             String name = value.substring(nameStart, i);
273                             buffer.append(getPlaceholderForName(xpath, name));
274                         }
275                         break;
276                     case '#':
277                         buffer.append(lastXpath.contains("decimalFormatLength") ? '#' : "{0}");
278                         break;
279                     default:
280                         // Don't append placeholder names.
281                         if (numOpen < 2) {
282                             buffer.append(c);
283                         }
284                 }
285             }
286         }
287 
addValueToOutput(String xpath, String value)288         private void addValueToOutput(String xpath, String value) {
289             value = daip.processInput(xpath, value, null);
290             output.add(new XtbEntry(lastId, xpath, value));
291         }
292 
293         @Override
ignorableWhitespace(char[] arg0, int arg1, int arg2)294         public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {}
295 
296         @Override
processingInstruction(String arg0, String arg1)297         public void processingInstruction(String arg0, String arg1) throws SAXException {}
298 
299         @Override
skippedEntity(String arg0)300         public void skippedEntity(String arg0) throws SAXException {}
301 
302         @Override
startDocument()303         public void startDocument() throws SAXException {}
304 
305         @Override
endDocument()306         public void endDocument() throws SAXException {
307             if (orphanedMessages.size() > 0) {
308                 System.err.println(
309                         orphanedMessages.size() + " message IDs with no matching xpaths: ");
310                 for (String messageID : orphanedMessages) {
311                     System.err.println(messageID);
312                 }
313             }
314         }
315 
316         @Override
setDocumentLocator(Locator arg0)317         public void setDocumentLocator(Locator arg0) {}
318 
319         @Override
startPrefixMapping(String arg0, String arg1)320         public void startPrefixMapping(String arg0, String arg1) throws SAXException {}
321 
322         @Override
endPrefixMapping(String arg0)323         public void endPrefixMapping(String arg0) throws SAXException {}
324     }
325 
326     /** An XML handler for WSB files. */
327     private class WsbHandler extends XMLFileReader.SimpleHandler {
328         private Set<String> messageIds;
329 
WsbHandler()330         public WsbHandler() {
331             messageIds = new HashSet<>();
332         }
333 
334         @Override
handlePathValue(String path, String value)335         public void handlePathValue(String path, String value) {
336             Matcher matcher = ID_PATTERN.matcher(path);
337             if (matcher.find()) {
338                 messageIds.add(matcher.group(1));
339             }
340         }
341 
getOldMessages()342         public Set<String> getOldMessages() {
343             return messageIds;
344         }
345     }
346 
347     /**
348      * @param xpath the xpath that the placeholder belongs to
349      * @param name the name to get the placeholder for
350      * @return the placeholder, e.g. "{0}" or "{1}"
351      */
getPlaceholderForName(String xpath, String name)352     private String getPlaceholderForName(String xpath, String name) {
353         if (loadedReverseTagMaps == null) {
354             loadedReverseTagMaps = new HashMap<>();
355         }
356         Map<String, String> map = loadedReverseTagMaps.get(xpath);
357         if (map == null) {
358             map = new HashMap<>();
359             loadedReverseTagMaps.put(xpath, map);
360             Map<String, PlaceholderInfo> tagMap = getTagMap(xpath);
361             for (Map.Entry<String, PlaceholderInfo> entry : tagMap.entrySet()) {
362                 map.put(entry.getValue().name, entry.getKey());
363             }
364         }
365         return map.get(name);
366     }
367 
368     /**
369      * @param xpath the xpath to get placeholder information for
370      * @return a mapping of placeholders to placeholder information for the specified xpath
371      */
getTagMap(String xpath)372     private Map<String, PlaceholderInfo> getTagMap(String xpath) {
373         if (patternPlaceholders == null) {
374             patternPlaceholders = PatternPlaceholders.getInstance();
375         }
376         return patternPlaceholders.get(xpath);
377     }
378 
379     /**
380      * Loads the contents of an XTB file into memory.
381      *
382      * @param locale the locale of the XTB file to be loaded
383      * @return
384      */
load(String locale)385     private XtbInfo load(String locale) {
386         // HACKETY HACK: The wsb files use old langauge codes with hyphens
387         // instead of CLDR's underscores.
388         // The xtb files use hyphens but differ yet again from the CLDR and xtb
389         // language codes, e.g.
390         // wsb uses "iw" but xtb and CLDR use "he". This means that we can't
391         // convert the locale to the CLDR standard until after reading in the
392         // xtb/wsb files. Sigh.
393         // Get the set of previously translated messages from the WSB file.
394         Set<String> oldMessages = null;
395         try {
396             oldMessages = loadOldMessages(locale);
397         } catch (IllegalArgumentException e) {
398             System.err.println("No wsb found for " + locale + ", skipping");
399             return null;
400         }
401 
402         // Parse the XTB file.
403         XtbInfo info = new XtbInfo(LanguageCodeConverter.fromGoogleLocaleId(locale));
404         XtbHandler handler = new XtbHandler(oldMessages, info);
405         XMLReader xmlReader = XMLFileReader.createXMLReader(false);
406         xmlReader.setContentHandler(handler);
407         File inputFile = new File(xtbDir, locale + ".xtb");
408         try (InputStream fis = InputStreamFactory.createInputStream(inputFile)) {
409             //  FileInputStream fis = new FileInputStream(inputFile);
410             InputSource is = new InputSource(fis);
411             xmlReader.parse(is);
412             // fis.close();
413         } catch (SAXException | IOException e) {
414             System.err.println("Error loading " + inputFile.getAbsolutePath());
415             e.printStackTrace();
416         }
417         //            catch (SAXException e) {
418         //            System.err.println("Error loading " + inputFile.getAbsolutePath());
419         //            e.printStackTrace();
420         //        }
421         return info;
422     }
423 
424     /**
425      * Loads the set of messages that were previously translated.
426      *
427      * @param locale the locale of the messages to be retrieved
428      * @return
429      * @throws IllegalArgumentException if there was an error parsing the wsb
430      */
loadOldMessages(String locale)431     private Set<String> loadOldMessages(String locale) throws IllegalArgumentException {
432         locale = LanguageCodeConverter.toGoogleLocaleId(locale);
433         WsbHandler handler = new WsbHandler();
434         XMLFileReader xfr = new XMLFileReader().setHandler(handler);
435         File wsbFile = new File(inputDir, locale + '/' + locale + ".wsb");
436         xfr.read(wsbFile.getAbsolutePath(), -1, true);
437         return handler.getOldMessages();
438     }
439 
440     /**
441      * Processes all XTB files in the input directory that match the specified regex.
442      *
443      * @param regexFilter
444      */
processAll(String regexFilter)445     private void processAll(String regexFilter) {
446         out.println(
447                 "Locale\tMessage ID\tDescription\tEnglish Value\t"
448                         + "Translated Value\tType\tError Message");
449         for (String filename : xtbDir.list()) {
450             if (filename.matches(regexFilter + "\\.xtb")) {
451                 String locale = filename.substring(0, filename.length() - 4);
452                 XtbInfo xtbInfo = load(locale);
453                 if (xtbInfo == null) continue;
454                 check(xtbInfo);
455                 writeXml(xtbInfo);
456             }
457         }
458     }
459 
460     /**
461      * Checks the contents of the XTB file against the existing CLDR data.
462      *
463      * @param xtbInfo the contents of the XTB to be checked
464      */
check(XtbInfo xtbInfo)465     private void check(XtbInfo xtbInfo) {
466         String locale = xtbInfo.locale;
467         CLDRFile cldrFile = factory.make(locale, false).cloneAsThawed();
468         for (XtbEntry info : xtbInfo) {
469             cldrFile.add(info.xpath, info.value);
470         }
471         Map<String, String> options = new HashMap<>();
472         List<CheckStatus> possibleErrors = new ArrayList<>();
473         checkCldr.setCldrFileToCheck(cldrFile, new CheckCLDR.Options(options), possibleErrors);
474         for (CheckStatus status : possibleErrors) {
475             System.out.println(locale + "\tLOCALE ERROR\t" + status.getMessage());
476         }
477         int numErrors = 0;
478         for (XtbEntry info : xtbInfo) {
479             String xpath = CLDRFile.getDistinguishingXPath(info.xpath, null);
480             String fullPath = cldrFile.getFullXPath(xpath);
481             String value = info.value;
482             checkCldr.check(xpath, fullPath, value, new CheckCLDR.Options(options), possibleErrors);
483             numErrors += displayErrors(locale, info.messageId, xpath, value, possibleErrors);
484         }
485         if (numErrors == 0) System.out.println("No errors found for " + locale);
486         out.flush();
487     }
488 
489     /**
490      * Displays any errors that occurred when checking the specified xpath
491      *
492      * @param locale the locale of the xpath being checked
493      * @param id the message ID corresponding to the xpath
494      * @param xpath the xpath that was checked
495      * @param value the value of the xpath
496      * @param possibleErrors a list of errors generated by checking the xpath
497      */
displayErrors( String locale, String id, String xpath, String value, List<CheckStatus> possibleErrors)498     private int displayErrors(
499             String locale,
500             String id,
501             String xpath,
502             String value,
503             List<CheckStatus> possibleErrors) {
504         String description = getDescription(xpath, value);
505         // Ignore these interval formats since they'll be removed.
506         if (id.equals("8190100716823312848") || id.equals("8190100716823312848")) return 0;
507         int numErrors = 0;
508         for (CheckStatus status : possibleErrors) {
509             out.println(
510                     locale
511                             + "\t"
512                             + id
513                             + "\t"
514                             + description
515                             + "\t"
516                             + englishFile.getStringValue(xpath)
517                             + "\t"
518                             + value
519                             + "\t"
520                             + status.getType()
521                             + "\t"
522                             + status.getMessage().replace('\t', ' '));
523             if (status.getType().equals(Type.Error)) {
524                 numErrors++;
525             }
526         }
527         return numErrors;
528     }
529 
530     /**
531      * Writes the contents of the XTB file to an XML file in CLDR format.
532      *
533      * @param xtbInfo
534      */
writeXml(XtbInfo xtbInfo)535     private void writeXml(XtbInfo xtbInfo) {
536         String locale = xtbInfo.locale;
537         File xmlFile = new File(outputDir, locale + ".xml");
538         // Add proposed alt tags to all xpaths.
539         XMLSource altSource = new SimpleXMLSource(locale);
540         for (XtbEntry info : xtbInfo) {
541             altSource.putValueAtPath(info.xpath, info.value);
542         }
543         PrintWriter out;
544         try {
545             out = new PrintWriter(xmlFile);
546             new CLDRFile(altSource).write(out);
547             out.close();
548         } catch (FileNotFoundException e) {
549             System.err.println("Couldn't write " + xmlFile.getAbsolutePath() + " to disk.");
550         }
551     }
552 
getDescription(String path, String value)553     private String getDescription(String path, String value) {
554         if (pathDescription == null) {
555             SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance();
556             pathDescription =
557                     new PathDescription(
558                             supplementalDataInfo,
559                             englishFile,
560                             null,
561                             null,
562                             PathDescription.ErrorHandling.CONTINUE);
563         }
564         final String description = pathDescription.getDescription(path, value, null);
565         return description;
566     }
567 
    /**
     * The command-line options read by {@code main}: {@code source_dir} and {@code
     * destination_dir} must be given on the command line; the other three are read without
     * checking whether they occur.
     *
     * <p>NOTE(review): in the four-argument {@code add} calls the third argument is presumably
     * the default value - confirm against {@code Option.Options}.
     */
    private static final Options options =
            new Options()
                    .add(
                            "source_dir",
                            ".*",
                            "The source directory containing the xtb and wsb files to be read")
                    .add(
                            "destination_dir",
                            ".*",
                            "The destination directory to write the XML files to")
                    .add(
                            "locale_filter",
                            ".*",
                            ".*",
                            "A regex filter for (Google) locales to be processed")
                    .add("test_filter", ".*", ".*", "A regex filter for CheckCLDR tests")
                    .add(
                            "error_file",
                            ".*",
                            "./errors.tsv",
                            "The file that checking results should be written to");
589 
590     /**
591      * @param args
592      */
main(String[] args)593     public static void main(String[] args) throws IOException {
594         options.parse(args, true);
595         String inputDir = null;
596         Option option = options.get("source_dir");
597         if (option.doesOccur()) {
598             inputDir = option.getValue();
599         } else {
600             throw new RuntimeException("Input dir must be specified");
601         }
602         String outputDir = null;
603         option = options.get("destination_dir");
604         if (option.doesOccur()) {
605             outputDir = option.getValue();
606         } else {
607             throw new RuntimeException("Output dir must be specified");
608         }
609         String localeFilter = options.get("locale_filter").getValue();
610         String testFilter = options.get("test_filter").getValue();
611         String errorFile = options.get("error_file").getValue();
612 
613         // input, output
614         ConvertXTB converter = new ConvertXTB(inputDir, outputDir, testFilter);
615         PrintStream out = new PrintStream(new File(errorFile));
616         converter.setErrorOutput(out);
617         converter.processAll(localeFilter);
618         out.close();
619     }
620 }
621