1 package org.unicode.cldr.tool; 2 3 import java.io.File; 4 import java.io.FileNotFoundException; 5 import java.io.IOException; 6 import java.io.InputStream; 7 import java.io.PrintStream; 8 import java.io.PrintWriter; 9 import java.util.ArrayList; 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.Iterator; 13 import java.util.List; 14 import java.util.Map; 15 import java.util.Set; 16 import java.util.regex.Matcher; 17 import java.util.regex.Pattern; 18 import org.unicode.cldr.test.CheckCLDR; 19 import org.unicode.cldr.test.CheckCLDR.CheckStatus; 20 import org.unicode.cldr.test.CheckCLDR.CheckStatus.Type; 21 import org.unicode.cldr.test.DisplayAndInputProcessor; 22 import org.unicode.cldr.tool.Option.Options; 23 import org.unicode.cldr.util.CLDRFile; 24 import org.unicode.cldr.util.CLDRLocale; 25 import org.unicode.cldr.util.CLDRPaths; 26 import org.unicode.cldr.util.Factory; 27 import org.unicode.cldr.util.InputStreamFactory; 28 import org.unicode.cldr.util.PathDescription; 29 import org.unicode.cldr.util.PatternCache; 30 import org.unicode.cldr.util.PatternPlaceholders; 31 import org.unicode.cldr.util.PatternPlaceholders.PlaceholderInfo; 32 import org.unicode.cldr.util.SimpleXMLSource; 33 import org.unicode.cldr.util.SupplementalDataInfo; 34 import org.unicode.cldr.util.XMLFileReader; 35 import org.unicode.cldr.util.XMLSource; 36 import org.xml.sax.Attributes; 37 import org.xml.sax.ContentHandler; 38 import org.xml.sax.InputSource; 39 import org.xml.sax.Locator; 40 import org.xml.sax.SAXException; 41 import org.xml.sax.XMLReader; 42 43 /** 44 * A command-line tool for converting XTB files to the CLDR format and checking them against current 45 * CLDR data. 46 * 47 * @author [email protected] (Jennifer Chye) 48 */ 49 public class ConvertXTB { 50 private static final Pattern ID_PATTERN = PatternCache.get("\\[@id=\"(\\d++)\"]"); 51 private static final Pattern PLURAL_MESSAGE_FORMAT = 52 PatternCache.get("\\{[A-Z_]++,plural, (.*)}"); 53 54 private static PatternPlaceholders patternPlaceholders; 55 private static Map<String, Map<String, String>> loadedReverseTagMaps; 56 private static PathDescription pathDescription; 57 58 private Factory factory; 59 private File xtbDir; 60 private File inputDir; 61 private String outputDir; 62 private CheckCLDR checkCldr; 63 private CLDRFile englishFile; 64 private PrintStream out; 65 66 /*** 67 * Constructor. The input directory must have the following format: 68 * 69 * <pre> 70 * inputDir/ 71 * ar/ 72 * ar.wsb 73 * ca/ 74 * ca.wsb 75 * ... 76 * xtb/ 77 * ar.xtb 78 * ca.xtb 79 * ... 80 * </pre> 81 * 82 * @param inputDir 83 * the directory to read the wsb and xtb files from 84 * @param outputDir 85 * the directory to write the generated CLDR files to 86 * @param checkFilter 87 * the CheckCLDR regex filter for checking 88 */ ConvertXTB(String inputDir, String outputDir, String checkFilter)89 private ConvertXTB(String inputDir, String outputDir, String checkFilter) { 90 xtbDir = new File(inputDir, "xtb"); 91 this.inputDir = new File(inputDir); 92 this.outputDir = outputDir; 93 factory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); 94 englishFile = factory.make("en", true); 95 this.checkCldr = CheckCLDR.getCheckAll(factory, checkFilter); 96 CheckCLDR.setDisplayInformation(englishFile); 97 out = System.out; 98 } 99 100 /** 101 * Sets the PrintStream that any errors will be sent to. System.out is used by default. 102 * 103 * @param out 104 */ setErrorOutput(PrintStream out)105 private void setErrorOutput(PrintStream out) { 106 this.out = out; 107 } 108 109 /** Wrapper class for the contents of an XTB file. */ 110 private class XtbInfo implements Iterable<XtbEntry> { 111 public String locale; 112 public List<XtbEntry> entries; 113 XtbInfo(String locale)114 public XtbInfo(String locale) { 115 this.locale = locale; 116 entries = new ArrayList<>(); 117 } 118 119 @Override iterator()120 public Iterator<XtbEntry> iterator() { 121 return entries.iterator(); 122 } 123 add(XtbEntry entry)124 public void add(XtbEntry entry) { 125 entries.add(entry); 126 } 127 } 128 129 /** Wrapper class for information related to a <translation> node in an XTB file. */ 130 private class XtbEntry { 131 public String messageId; 132 public String xpath; 133 public String value; 134 XtbEntry(String messageId, String xpath, String value)135 public XtbEntry(String messageId, String xpath, String value) { 136 this.messageId = messageId; 137 this.xpath = xpath; 138 this.value = value; 139 } 140 } 141 142 /** An XML handler for XTB files. */ 143 private class XtbHandler implements ContentHandler { 144 private DisplayAndInputProcessor daip; 145 private StringBuffer currentText; 146 private String lastId; 147 private String lastXpath; 148 149 private Set<String> orphanedMessages; 150 private Set<String> oldMessages; 151 private XtbInfo output; 152 XtbHandler(Set<String> oldMessages, XtbInfo output)153 public XtbHandler(Set<String> oldMessages, XtbInfo output) { 154 daip = new DisplayAndInputProcessor(CLDRLocale.getInstance(output.locale)); 155 currentText = new StringBuffer(); 156 orphanedMessages = new HashSet<>(); 157 this.oldMessages = oldMessages; 158 this.output = output; 159 } 160 161 @Override characters(char[] ch, int start, int length)162 public void characters(char[] ch, int start, int length) throws SAXException { 163 currentText.append(ch, start, length); 164 } 165 166 @Override startElement(String uri, String localName, String qName, Attributes attr)167 public void startElement(String uri, String localName, String qName, Attributes attr) 168 throws SAXException { 169 if (qName.equals("translation")) { 170 lastId = attr.getValue("id"); 171 lastXpath = IdToPath.getPath(lastId); 172 currentText.setLength(0); 173 } else if (qName.equals("ph")) { 174 String name = attr.getValue("name"); 175 String placeholder = getPlaceholderForName(lastXpath, name); 176 currentText.append(placeholder); 177 } 178 } 179 180 @Override endElement(String uri, String localName, String qName)181 public void endElement(String uri, String localName, String qName) throws SAXException { 182 if (qName.equals("translation")) { 183 if (lastXpath == null) { 184 orphanedMessages.add(lastId); 185 } else if (!oldMessages.contains(lastId)) { 186 // Only add new values to reduce computation time. 187 addValue(lastXpath, currentText.toString()); 188 } 189 currentText.setLength(0); 190 } 191 } 192 193 /** 194 * Add the specified xpath and value to the output. 195 * 196 * @param xpath 197 * @param value 198 */ addValue(String xpath, String value)199 private void addValue(String xpath, String value) { 200 Matcher matcher = PLURAL_MESSAGE_FORMAT.matcher(value); 201 if (matcher.matches()) { 202 // Parse the plural value. Example plural value: 203 // {NUMBER,plural, =0{0 {CURRENCY_NAME}}=1{1 {CURRENCY_NAME}} 204 // one{# {CURRENCY_NAME}}other{# {CURRENCY_NAME}}} 205 addPluralValue(xpath, matcher.group(1)); 206 } else { 207 addValueToOutput(xpath, value); 208 } 209 } 210 211 /** 212 * Processes a plural value and xpath and adds them to the output. 213 * 214 * @param xpath 215 * @param value 216 */ addPluralValue(String xpath, String value)217 private void addPluralValue(String xpath, String value) { 218 // Example plural value to be parsed: 219 // =0{0 {CURRENCY_NAME}}=1{1 {CURRENCY_NAME}} 220 // one{# {CURRENCY_NAME}}other{# {CURRENCY_NAME}} 221 int numOpen = 0; 222 StringBuffer buffer = new StringBuffer(); 223 String countType = null; 224 int nameStart = -1; 225 for (int i = 0; i < value.length(); i++) { 226 char c = value.charAt(i); 227 switch (c) { 228 case '{': 229 if (numOpen == 0) { 230 int startIndex = buffer.charAt(0) == '=' ? 1 : 0; 231 countType = buffer.substring(startIndex); 232 buffer.setLength(0); 233 } else { 234 // Start of placeholder. 235 nameStart = i + 1; 236 } 237 numOpen++; 238 break; 239 case '}': 240 numOpen--; 241 if (numOpen == 0) { 242 // Special handling for decimal format lengths. 243 if (lastXpath.contains("decimalFormatLength")) { 244 if (countType.length() == 1) { 245 countType = countType.charAt(0) == '1' ? "one" : "zero"; 246 } else if (countType.equals("one")) { 247 // skip, contains rubbish 248 buffer.setLength(0); 249 countType = null; 250 break; 251 } 252 } 253 // Add the count attribute back to the xpath. 254 String pluralXPath = xpath + "[@count=\"" + countType + "\"]"; 255 // Add any remaining missing placeholders. 256 String pluralValue = buffer.toString(); 257 if (pluralValue.contains("{1}") && !pluralValue.contains("{0}")) { 258 // Fix placeholder numbering. Assumes there is only one 259 // placeholder in the pattern. 260 if (countType.matches("[01]")) { 261 pluralValue = 262 pluralValue.replaceAll(countType + "(?!})", "{0}"); 263 } else { 264 pluralValue = pluralValue.replace("{1}", "{0}"); 265 } 266 } 267 addValueToOutput(pluralXPath, pluralValue); 268 buffer.setLength(0); 269 countType = null; 270 } else { 271 // End of placeholder. 272 String name = value.substring(nameStart, i); 273 buffer.append(getPlaceholderForName(xpath, name)); 274 } 275 break; 276 case '#': 277 buffer.append(lastXpath.contains("decimalFormatLength") ? '#' : "{0}"); 278 break; 279 default: 280 // Don't append placeholder names. 281 if (numOpen < 2) { 282 buffer.append(c); 283 } 284 } 285 } 286 } 287 addValueToOutput(String xpath, String value)288 private void addValueToOutput(String xpath, String value) { 289 value = daip.processInput(xpath, value, null); 290 output.add(new XtbEntry(lastId, xpath, value)); 291 } 292 293 @Override ignorableWhitespace(char[] arg0, int arg1, int arg2)294 public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {} 295 296 @Override processingInstruction(String arg0, String arg1)297 public void processingInstruction(String arg0, String arg1) throws SAXException {} 298 299 @Override skippedEntity(String arg0)300 public void skippedEntity(String arg0) throws SAXException {} 301 302 @Override startDocument()303 public void startDocument() throws SAXException {} 304 305 @Override endDocument()306 public void endDocument() throws SAXException { 307 if (orphanedMessages.size() > 0) { 308 System.err.println( 309 orphanedMessages.size() + " message IDs with no matching xpaths: "); 310 for (String messageID : orphanedMessages) { 311 System.err.println(messageID); 312 } 313 } 314 } 315 316 @Override setDocumentLocator(Locator arg0)317 public void setDocumentLocator(Locator arg0) {} 318 319 @Override startPrefixMapping(String arg0, String arg1)320 public void startPrefixMapping(String arg0, String arg1) throws SAXException {} 321 322 @Override endPrefixMapping(String arg0)323 public void endPrefixMapping(String arg0) throws SAXException {} 324 } 325 326 /** An XML handler for WSB files. */ 327 private class WsbHandler extends XMLFileReader.SimpleHandler { 328 private Set<String> messageIds; 329 WsbHandler()330 public WsbHandler() { 331 messageIds = new HashSet<>(); 332 } 333 334 @Override handlePathValue(String path, String value)335 public void handlePathValue(String path, String value) { 336 Matcher matcher = ID_PATTERN.matcher(path); 337 if (matcher.find()) { 338 messageIds.add(matcher.group(1)); 339 } 340 } 341 getOldMessages()342 public Set<String> getOldMessages() { 343 return messageIds; 344 } 345 } 346 347 /** 348 * @param xpath the xpath that the placeholder belongs to 349 * @param name the name to get the placeholder for 350 * @return the placeholder, e.g. "{0}" or "{1}" 351 */ getPlaceholderForName(String xpath, String name)352 private String getPlaceholderForName(String xpath, String name) { 353 if (loadedReverseTagMaps == null) { 354 loadedReverseTagMaps = new HashMap<>(); 355 } 356 Map<String, String> map = loadedReverseTagMaps.get(xpath); 357 if (map == null) { 358 map = new HashMap<>(); 359 loadedReverseTagMaps.put(xpath, map); 360 Map<String, PlaceholderInfo> tagMap = getTagMap(xpath); 361 for (Map.Entry<String, PlaceholderInfo> entry : tagMap.entrySet()) { 362 map.put(entry.getValue().name, entry.getKey()); 363 } 364 } 365 return map.get(name); 366 } 367 368 /** 369 * @param xpath the xpath to get placeholder information for 370 * @return a mapping of placeholders to placeholder information for the specified xpath 371 */ getTagMap(String xpath)372 private Map<String, PlaceholderInfo> getTagMap(String xpath) { 373 if (patternPlaceholders == null) { 374 patternPlaceholders = PatternPlaceholders.getInstance(); 375 } 376 return patternPlaceholders.get(xpath); 377 } 378 379 /** 380 * Loads the contents of an XTB file into memory. 381 * 382 * @param locale the locale of the XTB file to be loaded 383 * @return 384 */ load(String locale)385 private XtbInfo load(String locale) { 386 // HACKETY HACK: The wsb files use old langauge codes with hyphens 387 // instead of CLDR's underscores. 388 // The xtb files use hyphens but differ yet again from the CLDR and xtb 389 // language codes, e.g. 390 // wsb uses "iw" but xtb and CLDR use "he". This means that we can't 391 // convert the locale to the CLDR standard until after reading in the 392 // xtb/wsb files. Sigh. 393 // Get the set of previously translated messages from the WSB file. 394 Set<String> oldMessages = null; 395 try { 396 oldMessages = loadOldMessages(locale); 397 } catch (IllegalArgumentException e) { 398 System.err.println("No wsb found for " + locale + ", skipping"); 399 return null; 400 } 401 402 // Parse the XTB file. 403 XtbInfo info = new XtbInfo(LanguageCodeConverter.fromGoogleLocaleId(locale)); 404 XtbHandler handler = new XtbHandler(oldMessages, info); 405 XMLReader xmlReader = XMLFileReader.createXMLReader(false); 406 xmlReader.setContentHandler(handler); 407 File inputFile = new File(xtbDir, locale + ".xtb"); 408 try (InputStream fis = InputStreamFactory.createInputStream(inputFile)) { 409 // FileInputStream fis = new FileInputStream(inputFile); 410 InputSource is = new InputSource(fis); 411 xmlReader.parse(is); 412 // fis.close(); 413 } catch (SAXException | IOException e) { 414 System.err.println("Error loading " + inputFile.getAbsolutePath()); 415 e.printStackTrace(); 416 } 417 // catch (SAXException e) { 418 // System.err.println("Error loading " + inputFile.getAbsolutePath()); 419 // e.printStackTrace(); 420 // } 421 return info; 422 } 423 424 /** 425 * Loads the set of messages that were previously translated. 426 * 427 * @param locale the locale of the messages to be retrieved 428 * @return 429 * @throws IllegalArgumentException if there was an error parsing the wsb 430 */ loadOldMessages(String locale)431 private Set<String> loadOldMessages(String locale) throws IllegalArgumentException { 432 locale = LanguageCodeConverter.toGoogleLocaleId(locale); 433 WsbHandler handler = new WsbHandler(); 434 XMLFileReader xfr = new XMLFileReader().setHandler(handler); 435 File wsbFile = new File(inputDir, locale + '/' + locale + ".wsb"); 436 xfr.read(wsbFile.getAbsolutePath(), -1, true); 437 return handler.getOldMessages(); 438 } 439 440 /** 441 * Processes all XTB files in the input directory that match the specified regex. 442 * 443 * @param regexFilter 444 */ processAll(String regexFilter)445 private void processAll(String regexFilter) { 446 out.println( 447 "Locale\tMessage ID\tDescription\tEnglish Value\t" 448 + "Translated Value\tType\tError Message"); 449 for (String filename : xtbDir.list()) { 450 if (filename.matches(regexFilter + "\\.xtb")) { 451 String locale = filename.substring(0, filename.length() - 4); 452 XtbInfo xtbInfo = load(locale); 453 if (xtbInfo == null) continue; 454 check(xtbInfo); 455 writeXml(xtbInfo); 456 } 457 } 458 } 459 460 /** 461 * Checks the contents of the XTB file against the existing CLDR data. 462 * 463 * @param xtbInfo the contents of the XTB to be checked 464 */ check(XtbInfo xtbInfo)465 private void check(XtbInfo xtbInfo) { 466 String locale = xtbInfo.locale; 467 CLDRFile cldrFile = factory.make(locale, false).cloneAsThawed(); 468 for (XtbEntry info : xtbInfo) { 469 cldrFile.add(info.xpath, info.value); 470 } 471 Map<String, String> options = new HashMap<>(); 472 List<CheckStatus> possibleErrors = new ArrayList<>(); 473 checkCldr.setCldrFileToCheck(cldrFile, new CheckCLDR.Options(options), possibleErrors); 474 for (CheckStatus status : possibleErrors) { 475 System.out.println(locale + "\tLOCALE ERROR\t" + status.getMessage()); 476 } 477 int numErrors = 0; 478 for (XtbEntry info : xtbInfo) { 479 String xpath = CLDRFile.getDistinguishingXPath(info.xpath, null); 480 String fullPath = cldrFile.getFullXPath(xpath); 481 String value = info.value; 482 checkCldr.check(xpath, fullPath, value, new CheckCLDR.Options(options), possibleErrors); 483 numErrors += displayErrors(locale, info.messageId, xpath, value, possibleErrors); 484 } 485 if (numErrors == 0) System.out.println("No errors found for " + locale); 486 out.flush(); 487 } 488 489 /** 490 * Displays any errors that occurred when checking the specified xpath 491 * 492 * @param locale the locale of the xpath being checked 493 * @param id the message ID corresponding to the xpath 494 * @param xpath the xpath that was checked 495 * @param value the value of the xpath 496 * @param possibleErrors a list of errors generated by checking the xpath 497 */ displayErrors( String locale, String id, String xpath, String value, List<CheckStatus> possibleErrors)498 private int displayErrors( 499 String locale, 500 String id, 501 String xpath, 502 String value, 503 List<CheckStatus> possibleErrors) { 504 String description = getDescription(xpath, value); 505 // Ignore these interval formats since they'll be removed. 506 if (id.equals("8190100716823312848") || id.equals("8190100716823312848")) return 0; 507 int numErrors = 0; 508 for (CheckStatus status : possibleErrors) { 509 out.println( 510 locale 511 + "\t" 512 + id 513 + "\t" 514 + description 515 + "\t" 516 + englishFile.getStringValue(xpath) 517 + "\t" 518 + value 519 + "\t" 520 + status.getType() 521 + "\t" 522 + status.getMessage().replace('\t', ' ')); 523 if (status.getType().equals(Type.Error)) { 524 numErrors++; 525 } 526 } 527 return numErrors; 528 } 529 530 /** 531 * Writes the contents of the XTB file to an XML file in CLDR format. 532 * 533 * @param xtbInfo 534 */ writeXml(XtbInfo xtbInfo)535 private void writeXml(XtbInfo xtbInfo) { 536 String locale = xtbInfo.locale; 537 File xmlFile = new File(outputDir, locale + ".xml"); 538 // Add proposed alt tags to all xpaths. 539 XMLSource altSource = new SimpleXMLSource(locale); 540 for (XtbEntry info : xtbInfo) { 541 altSource.putValueAtPath(info.xpath, info.value); 542 } 543 PrintWriter out; 544 try { 545 out = new PrintWriter(xmlFile); 546 new CLDRFile(altSource).write(out); 547 out.close(); 548 } catch (FileNotFoundException e) { 549 System.err.println("Couldn't write " + xmlFile.getAbsolutePath() + " to disk."); 550 } 551 } 552 getDescription(String path, String value)553 private String getDescription(String path, String value) { 554 if (pathDescription == null) { 555 SupplementalDataInfo supplementalDataInfo = SupplementalDataInfo.getInstance(); 556 pathDescription = 557 new PathDescription( 558 supplementalDataInfo, 559 englishFile, 560 null, 561 null, 562 PathDescription.ErrorHandling.CONTINUE); 563 } 564 final String description = pathDescription.getDescription(path, value, null); 565 return description; 566 } 567 568 private static final Options options = 569 new Options() 570 .add( 571 "source_dir", 572 ".*", 573 "The source directory containing the xtb and wsb files to be read") 574 .add( 575 "destination_dir", 576 ".*", 577 "The destination directory to write the XML files to") 578 .add( 579 "locale_filter", 580 ".*", 581 ".*", 582 "A regex filter for (Google) locales to be processed") 583 .add("test_filter", ".*", ".*", "A regex filter for CheckCLDR tests") 584 .add( 585 "error_file", 586 ".*", 587 "./errors.tsv", 588 "The file that checking results should be written to"); 589 590 /** 591 * @param args 592 */ main(String[] args)593 public static void main(String[] args) throws IOException { 594 options.parse(args, true); 595 String inputDir = null; 596 Option option = options.get("source_dir"); 597 if (option.doesOccur()) { 598 inputDir = option.getValue(); 599 } else { 600 throw new RuntimeException("Input dir must be specified"); 601 } 602 String outputDir = null; 603 option = options.get("destination_dir"); 604 if (option.doesOccur()) { 605 outputDir = option.getValue(); 606 } else { 607 throw new RuntimeException("Output dir must be specified"); 608 } 609 String localeFilter = options.get("locale_filter").getValue(); 610 String testFilter = options.get("test_filter").getValue(); 611 String errorFile = options.get("error_file").getValue(); 612 613 // input, output 614 ConvertXTB converter = new ConvertXTB(inputDir, outputDir, testFilter); 615 PrintStream out = new PrintStream(new File(errorFile)); 616 converter.setErrorOutput(out); 617 converter.processAll(localeFilter); 618 out.close(); 619 } 620 } 621