1 package org.unicode.cldr.tool; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.base.Splitter; 5 import com.ibm.icu.impl.Relation; 6 import com.ibm.icu.impl.Row; 7 import com.ibm.icu.impl.Row.R2; 8 import com.ibm.icu.impl.Row.R4; 9 import com.ibm.icu.util.VersionInfo; 10 import java.io.BufferedReader; 11 import java.io.File; 12 import java.io.IOException; 13 import java.io.PrintWriter; 14 import java.util.ArrayList; 15 import java.util.Arrays; 16 import java.util.Collection; 17 import java.util.Collections; 18 import java.util.HashSet; 19 import java.util.LinkedHashSet; 20 import java.util.List; 21 import java.util.Map; 22 import java.util.Map.Entry; 23 import java.util.Set; 24 import java.util.TreeMap; 25 import java.util.TreeSet; 26 import java.util.regex.Matcher; 27 import java.util.regex.Pattern; 28 import org.unicode.cldr.draft.FileUtilities; 29 import org.unicode.cldr.tool.Option.Options; 30 import org.unicode.cldr.util.Builder; 31 import org.unicode.cldr.util.CLDRConfig; 32 import org.unicode.cldr.util.CLDRFile; 33 import org.unicode.cldr.util.CLDRPaths; 34 import org.unicode.cldr.util.CldrUtility; 35 import org.unicode.cldr.util.Counter; 36 import org.unicode.cldr.util.DtdData; 37 import org.unicode.cldr.util.DtdData.Attribute; 38 import org.unicode.cldr.util.DtdData.Element; 39 import org.unicode.cldr.util.DtdType; 40 import org.unicode.cldr.util.PathStarrer; 41 import org.unicode.cldr.util.PathUtilities; 42 import org.unicode.cldr.util.PatternCache; 43 import org.unicode.cldr.util.RegexUtilities; 44 import org.unicode.cldr.util.SupplementalDataInfo; 45 import org.unicode.cldr.util.XMLFileReader; 46 import org.unicode.cldr.util.XMLFileReader.SimpleHandler; 47 import org.unicode.cldr.util.XPathParts; 48 import org.xml.sax.ErrorHandler; 49 import org.xml.sax.SAXException; 50 import org.xml.sax.SAXParseException; 51 52 public class GenerateItemCounts { 53 private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = 54 CLDRConfig.getInstance().getSupplementalDataInfo(); 55 private static final boolean SKIP_ORDERING = true; 56 private static final String OUT_DIRECTORY = 57 CLDRPaths.GEN_DIRECTORY + "/itemcount/"; // CldrUtility.MAIN_DIRECTORY; 58 private Map<String, List<StackTraceElement>> cantRead = new TreeMap<>(); 59 60 static { 61 System.err.println("Probably obsolete tool"); 62 } 63 64 private static String[] DIRECTORIES = { 65 // MUST be oldest first! 66 // "cldr-archive/cldr-21.0", 67 // "cldr-24.0", 68 "cldr-27.0", "trunk" 69 }; 70 71 private static String TRUNK_VERSION = "26.0"; 72 73 static boolean doChanges = true; 74 static Relation<String, String> path2value = 75 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 76 static final AttributeTypes ATTRIBUTE_TYPES = new AttributeTypes(); 77 78 static final Options myOptions = new Options(); 79 80 enum MyOptions { 81 summary( 82 null, 83 null, 84 "if present, summarizes data already collected. Run once with, once without."), 85 directory( 86 ".*", 87 ".*", 88 "if summary, creates filtered version (eg -d main): does a find in the name, which is of the form dir/file"), 89 verbose(null, null, "verbose debugging messages"), 90 rawfilter(".*", ".*", "filter the raw files (non-summary, mostly for debugging)"), 91 ; 92 // boilerplate 93 final Option option; 94 MyOptions(String argumentPattern, String defaultArgument, String helpText)95 MyOptions(String argumentPattern, String defaultArgument, String helpText) { 96 option = myOptions.add(this, argumentPattern, defaultArgument, helpText); 97 } 98 } 99 100 static Matcher DIR_FILE_MATCHER; 101 static Matcher RAW_FILE_MATCHER; 102 static boolean VERBOSE; 103 main(String[] args)104 public static void main(String[] args) throws IOException { 105 myOptions.parse(MyOptions.directory, args, true); 106 107 DIR_FILE_MATCHER = PatternCache.get(MyOptions.directory.option.getValue()).matcher(""); 108 RAW_FILE_MATCHER = PatternCache.get(MyOptions.rawfilter.option.getValue()).matcher(""); 109 VERBOSE = MyOptions.verbose.option.doesOccur(); 110 111 if (MyOptions.summary.option.doesOccur()) { 112 doSummary(); 113 System.out.println("DONE"); 114 return; 115 // } else if (arg.equals("changes")) { 116 // doChanges = true; 117 } else { 118 } 119 // Pattern dirPattern = dirPattern = PatternCache.get(arg); 120 GenerateItemCounts main = new GenerateItemCounts(); 121 try { 122 Relation<String, String> oldPath2value = null; 123 for (String dir : DIRECTORIES) { 124 // if (dirPattern != null && !dirPattern.matcher(dir).find()) continue; 125 final String pathname = 126 dir.equals("trunk") 127 ? CLDRPaths.BASE_DIRECTORY 128 : CLDRPaths.ARCHIVE_DIRECTORY + "/" + dir; 129 boolean isFinal = dir == DIRECTORIES[DIRECTORIES.length - 1]; 130 131 String fulldir = PathUtilities.getNormalizedPathString(pathname); 132 String prefix = (MyOptions.rawfilter.option.doesOccur() ? "filtered_" : ""); 133 String fileKey = dir.replace("/", "_"); 134 try (PrintWriter summary = 135 FileUtilities.openUTF8Writer( 136 OUT_DIRECTORY, prefix + fileKey + "_count.txt"); 137 PrintWriter changes = 138 FileUtilities.openUTF8Writer( 139 OUT_DIRECTORY, prefix + fileKey + "_changes.txt"); 140 PrintWriter changesNew = 141 FileUtilities.openUTF8Writer( 142 OUT_DIRECTORY, prefix + fileKey + "_news.txt"); 143 PrintWriter changesDeletes = 144 FileUtilities.openUTF8Writer( 145 OUT_DIRECTORY, prefix + fileKey + "_deletes.txt"); 146 PrintWriter changesSummary = 147 FileUtilities.openUTF8Writer( 148 OUT_DIRECTORY, 149 prefix + fileKey + "_changes_summary.txt"); ) { 150 main.summarizeCoverage(summary, fulldir, isFinal); 151 if (doChanges) { 152 if (oldPath2value != null) { 153 compare( 154 summary, 155 changes, 156 changesNew, 157 changesDeletes, 158 changesSummary, 159 oldPath2value, 160 path2value); 161 checkBadAttributes(path2value, prefix + fileKey + "_dtd_check.txt"); 162 } 163 oldPath2value = path2value; 164 path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 165 } 166 } 167 } 168 ATTRIBUTE_TYPES.showStarred(); 169 } finally { 170 if (main.cantRead.size() != 0) { 171 System.out.println("Couldn't read:\t"); 172 for (String file : main.cantRead.keySet()) { 173 System.out.println(file + "\t" + main.cantRead.get(file)); 174 } 175 } 176 System.out.println("DONE"); 177 } 178 } 179 180 static final Set<String> SKIP_ATTRIBUTES = 181 new HashSet<>(Arrays.asList("draft", "references", "validSubLocales")); 182 183 static final Relation<String, DtdType> ELEMENTS_OCCURRING = 184 Relation.of(new TreeMap(), TreeSet.class); 185 static final Relation<String, DtdType> ELEMENTS_POSSIBLE = 186 Relation.of(new TreeMap(), TreeSet.class); 187 static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_OCCURRING = 188 Relation.of(new TreeMap(), TreeSet.class); 189 static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_POSSIBLE = 190 Relation.of(new TreeMap(), TreeSet.class); 191 checkBadAttributes(Relation<String, String> path2value2, String outputFile)192 private static void checkBadAttributes(Relation<String, String> path2value2, String outputFile) 193 throws IOException { 194 // an attribute is misplaced if it is not distinguishing, but is on a non-final node. 195 196 Set<String> errors = new LinkedHashSet<>(); 197 198 SupplementalDataInfo supp = SUPPLEMENTAL_DATA_INFO; 199 for (DtdType dtdType : DtdType.values()) { 200 if (dtdType.getStatus() != DtdType.DtdStatus.active) continue; 201 if (dtdType == DtdType.ldmlICU) { 202 continue; 203 } 204 DtdData data = DtdData.getInstance(dtdType); 205 for (Element element : data.getElements()) { 206 String elementName = element.name; 207 ELEMENTS_POSSIBLE.put(elementName, dtdType); 208 final Set<Element> children = element.getChildren().keySet(); 209 210 boolean skipFinal = 211 children.isEmpty() 212 || children.size() == 1 213 && children.iterator().next().name.equals("special"); 214 215 for (Entry<Attribute, Integer> attributeInt : element.getAttributes().entrySet()) { 216 Attribute attribute = attributeInt.getKey(); 217 String attributeName = attribute.name; 218 if (attribute.defaultValue != null) { 219 errors.add( 220 "Warning, default value «" 221 + attribute.defaultValue 222 + "» for: " 223 + dtdType 224 + "\t" 225 + elementName 226 + "\t" 227 + attributeName); 228 } 229 final R2<DtdType, String> attributeRow = Row.of(dtdType, elementName); 230 ATTRIBUTES_POSSIBLE.put(attributeName, attributeRow); 231 if (skipFinal 232 || SKIP_ATTRIBUTES.contains( 233 attributeName)) { // don't worry about non-final, references, 234 // draft, standard 235 continue; 236 } 237 if (supp.isDeprecated(dtdType, elementName, attributeName, null)) { 238 continue; 239 } 240 if (!CLDRFile.isDistinguishing(dtdType, elementName, attributeName)) { 241 String doesOccur = ""; 242 final Set<R2<DtdType, String>> attributeRows = 243 ATTRIBUTES_OCCURRING.get(attributeName); 244 if (attributeRows == null || !attributeRows.contains(attributeRow)) { 245 doesOccur = "\tNEVER"; 246 } 247 errors.add( 248 "Warning, !disting, !leaf: " 249 + dtdType 250 + "\t" 251 + elementName 252 + "\t" 253 + attributeName 254 + "\t" 255 + children 256 + doesOccur); 257 } 258 } 259 } 260 } 261 try (PrintWriter out = FileUtilities.openUTF8Writer(OUT_DIRECTORY, outputFile)) { 262 out.println("\nElements\tDeprecated\tOccurring\tPossible in DTD, but never occurs"); 263 264 for (Entry<String, Set<DtdType>> x : ELEMENTS_POSSIBLE.keyValuesSet()) { 265 final String element = x.getKey(); 266 if (element.equals("#PCDATA") 267 || element.equals("ANY") 268 || element.equals("generation")) { 269 continue; 270 } 271 final Set<DtdType> possible = x.getValue(); 272 Set<DtdType> deprecated = new TreeSet(); 273 for (DtdType dtdType : possible) { 274 if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, "*", "*")) { 275 deprecated.add(dtdType); 276 } 277 } 278 Set<DtdType> notDeprecated = new TreeSet(possible); 279 notDeprecated.removeAll(deprecated); 280 281 Set<DtdType> occurs = 282 CldrUtility.ifNull(ELEMENTS_OCCURRING.get(element), Collections.EMPTY_SET); 283 Set<DtdType> noOccur = new TreeSet(possible); 284 noOccur.removeAll(occurs); 285 286 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur 287 final Set<DtdType> intersection = CldrUtility.intersect(deprecated, occurs); 288 errors.add( 289 "Error: element «" 290 + element 291 + "» is deprecated in " 292 + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) 293 + " but occurs in live data: " 294 + intersection); 295 } 296 if (!Collections.disjoint( 297 notDeprecated, noOccur)) { // if !deprecated & !occur, warning 298 errors.add( 299 "Warning: element «" 300 + element 301 + "» doesn't occur in and is not deprecated in " 302 + CldrUtility.intersect(notDeprecated, noOccur)); 303 } 304 305 out.println(element + "\t" + deprecated + "\t" + occurs + "\t" + noOccur); 306 } 307 308 out.println("\nAttributes\tDeprecated\tOccurring\tPossible in DTD, but never occurs"); 309 310 for (Entry<String, Set<R2<DtdType, String>>> x : ATTRIBUTES_POSSIBLE.keyValuesSet()) { 311 final String attribute = x.getKey(); 312 if (attribute.equals("alt") 313 || attribute.equals("draft") 314 || attribute.equals("references")) { 315 continue; 316 } 317 final Set<R2<DtdType, String>> possible = x.getValue(); 318 Set<R2<DtdType, String>> deprecated = new TreeSet(); 319 for (R2<DtdType, String> s : possible) { 320 final DtdType dtdType = s.get0(); 321 final String element = s.get1(); 322 if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, attribute, "*")) { 323 deprecated.add(s); 324 } 325 } 326 Set<R2<DtdType, String>> notDeprecated = new TreeSet(possible); 327 notDeprecated.removeAll(deprecated); 328 329 Set<R2<DtdType, String>> occurs = 330 CldrUtility.ifNull( 331 ATTRIBUTES_OCCURRING.get(attribute), Collections.EMPTY_SET); 332 Set<R2<DtdType, String>> noOccur = new TreeSet(possible); 333 noOccur.removeAll(occurs); 334 335 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur 336 final Set<R2<DtdType, String>> intersection = 337 CldrUtility.intersect(deprecated, occurs); 338 errors.add( 339 "Error: attribute «" 340 + attribute 341 + "» is deprecated in " 342 + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) 343 + " but occurs in live data: " 344 + intersection); 345 } 346 if (!Collections.disjoint( 347 notDeprecated, noOccur)) { // if !deprecated & !occur, warning 348 errors.add( 349 "Warning: attribute «" 350 + attribute 351 + "» doesn't occur in and is not deprecated in " 352 + CldrUtility.intersect(notDeprecated, noOccur)); 353 } 354 out.println(attribute + "\t" + deprecated + "\t" + occurs + "\t" + noOccur); 355 } 356 out.println("\nERRORS/WARNINGS"); 357 out.println(Joiner.on("\n").join(errors)); 358 } 359 } 360 361 static class AttributeTypes { 362 Relation<String, String> elementPathToAttributes = 363 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 364 final PathStarrer PATH_STARRER = new PathStarrer().setSubstitutionPattern("*"); 365 final Set<String> STARRED_PATHS = new TreeSet<>(); 366 StringBuilder elementPath = new StringBuilder(); 367 add(String path)368 public void add(String path) { 369 XPathParts parts = XPathParts.getFrozenInstance(path); 370 elementPath.setLength(0); 371 for (int i = 0; i < parts.size(); ++i) { 372 String element = parts.getElement(i); 373 elementPath.append('/').append(element); 374 elementPathToAttributes.putAll( 375 elementPath.toString().intern(), parts.getAttributeKeys(i)); 376 } 377 } 378 showStarred()379 public void showStarred() throws IOException { 380 PrintWriter starred = FileUtilities.openUTF8Writer(OUT_DIRECTORY, "starred" + ".txt"); 381 382 for (Entry<String, Set<String>> entry : elementPathToAttributes.keyValuesSet()) { 383 Set<String> attributes = entry.getValue(); 384 if (attributes.size() == 0) { 385 continue; 386 } 387 String path = entry.getKey(); 388 String[] elements = path.split("/"); 389 DtdType type = DtdType.valueOf(elements[1]); 390 String finalElement = elements[elements.length - 1]; 391 starred.print(path); 392 for (String attribute : attributes) { 393 if (CLDRFile.isDistinguishing(type, finalElement, attribute)) { 394 starred.print("[@" + attribute + "='disting.']"); 395 } else { 396 starred.print("[@" + attribute + "='DATA']"); 397 } 398 } 399 starred.println(); 400 } 401 starred.close(); 402 } 403 } 404 405 static Pattern prefix = PatternCache.get("([^/]+/[^/]+)(.*)"); 406 407 static class Delta { 408 Counter<String> newCount = new Counter<>(); 409 Counter<String> deletedCount = new Counter<>(); 410 Counter<String> changedCount = new Counter<>(); 411 Counter<String> unchangedCount = new Counter<>(); 412 print(PrintWriter changesSummary, Set<String> prefixes)413 void print(PrintWriter changesSummary, Set<String> prefixes) { 414 changesSummary.println( 415 "Total" 416 + "\t" 417 + unchangedCount.getTotal() 418 + "\t" 419 + deletedCount.getTotal() 420 + "\t" 421 + changedCount.getTotal() 422 + "\t" 423 + newCount.getTotal()); 424 changesSummary.println("Directory\tSame\tRemoved\tChanged\tAdded"); 425 for (String prefix : prefixes) { 426 changesSummary.println( 427 prefix 428 + "\t" 429 + unchangedCount.get(prefix) 430 + "\t" 431 + deletedCount.get(prefix) 432 + "\t" 433 + changedCount.get(prefix) 434 + "\t" 435 + newCount.get(prefix)); 436 } 437 } 438 } 439 compare( PrintWriter summary, PrintWriter changes, PrintWriter changesNew, PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value, Relation<String, String> path2value2)440 private static void compare( 441 PrintWriter summary, 442 PrintWriter changes, 443 PrintWriter changesNew, 444 PrintWriter changesDeletes, 445 PrintWriter changesSummary, 446 Relation<String, String> oldPath2value, 447 Relation<String, String> path2value2) { 448 Set<String> union = 449 Builder.with(new TreeSet<String>()) 450 .addAll(oldPath2value.keySet()) 451 .addAll(path2value2.keySet()) 452 .get(); 453 long total = 0; 454 Matcher prefixMatcher = prefix.matcher(""); 455 Delta charCount = new Delta(); 456 Delta itemCount = new Delta(); 457 Set<String> prefixes = new TreeSet(); 458 for (String path : union) { 459 if (!prefixMatcher.reset(path).find()) { 460 throw new IllegalArgumentException(); 461 } 462 String prefix = prefixMatcher.group(1); 463 prefixes.add(prefix); 464 String localPath = prefixMatcher.group(2); 465 Set<String> set1 = oldPath2value.getAll(path); 466 Set<String> set2 = path2value2.getAll(path); 467 if (set2 != null) { 468 total += set2.size(); 469 } 470 if (set1 == null) { 471 changesNew.println(prefix + "\t" + "\t" + set2 + "\t" + localPath); 472 itemCount.newCount.add(prefix, set2.size()); 473 charCount.newCount.add(prefix, totalLength(set2)); 474 } else if (set2 == null) { 475 changesDeletes.println(prefix + "\t" + set1 + "\t\t" + localPath); 476 itemCount.deletedCount.add(prefix, -set1.size()); 477 charCount.deletedCount.add(prefix, -totalLength(set1)); 478 } else if (!set1.equals(set2)) { 479 TreeSet<String> set1minus2 = 480 Builder.with(new TreeSet<String>()).addAll(set1).removeAll(set2).get(); 481 TreeSet<String> set2minus1 = 482 Builder.with(new TreeSet<String>()).addAll(set2).removeAll(set1).get(); 483 TreeSet<String> set2and1 = 484 Builder.with(new TreeSet<String>()).addAll(set2).retainAll(set1).get(); 485 itemCount.changedCount.add(prefix, (set2minus1.size() + set1minus2.size() + 1) / 2); 486 itemCount.unchangedCount.add(prefix, set2and1.size()); 487 charCount.changedCount.add( 488 prefix, (totalLength(set2minus1) + totalLength(set1minus2) + 1) / 2); 489 charCount.unchangedCount.add(prefix, totalLength(set2and1)); 490 changes.println(prefix + "\t" + set1minus2 + "\t" + set2minus1 + "\t" + localPath); 491 } else { 492 itemCount.unchangedCount.add(prefix, set2.size()); 493 charCount.unchangedCount.add(prefix, totalLength(set2)); 494 } 495 } 496 itemCount.print(changesSummary, prefixes); 497 changesSummary.println(); 498 charCount.print(changesSummary, prefixes); 499 // union = Builder.with(new TreeSet<String>()) 500 // .addAll(newCount.keySet()) 501 // .addAll(deletedCount.keySet()) 502 // .addAll(changedCount.keySet()) 503 // .addAll(unchangedCount.keySet()) 504 // .get(); 505 summary.println("#Total:\t" + total); 506 } 507 totalLength(Set<String> set2)508 private static long totalLength(Set<String> set2) { 509 int result = 0; 510 for (String s : set2) { 511 result += s.length(); 512 } 513 return result; 514 } 515 516 static final Pattern LOCALE_PATTERN = 517 PatternCache.get( 518 "([a-z]{2,3})(?:[_-]([A-Z][a-z]{3}))?(?:[_-]([a-zA-Z0-9]{2,3}))?([_-][a-zA-Z0-9]{1,8})*"); 519 doSummary()520 public static void doSummary() throws IOException { 521 Map<String, R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>> 522 key_release_count = new TreeMap<>(); 523 Matcher countryLocale = LOCALE_PATTERN.matcher(""); 524 List<String> releases = new ArrayList<>(); 525 Pattern releaseNumber = PatternCache.get("count_(?:.*-(\\d+(\\.\\d+)*)|trunk)\\.txt"); 526 // int releaseCount = 1; 527 Relation<String, String> release_keys = 528 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 529 Relation<String, String> localesToPaths = 530 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 531 Set<String> writtenLanguages = new TreeSet<>(); 532 Set<String> countries = new TreeSet<>(); 533 534 File[] listFiles = new File(OUT_DIRECTORY).listFiles(); 535 // find the most recent version 536 VersionInfo mostRecentVersion = VersionInfo.getInstance(0); 537 for (File subdir : listFiles) { 538 final String name = subdir.getName(); 539 final Matcher releaseMatcher = releaseNumber.matcher(name); 540 if (!releaseMatcher.matches()) { 541 if (name.startsWith("count_")) { 542 throw new IllegalArgumentException( 543 "Bad match " + RegexUtilities.showMismatch(releaseMatcher, name)); 544 } 545 continue; 546 } 547 String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++; 548 if (releaseNum == null) { 549 releaseNum = TRUNK_VERSION; 550 } 551 VersionInfo vi = VersionInfo.getInstance(releaseNum); 552 if (vi.compareTo(mostRecentVersion) > 0) { 553 mostRecentVersion = vi; 554 } 555 } 556 557 for (File subdir : listFiles) { 558 final String name = subdir.getName(); 559 final Matcher releaseMatcher = releaseNumber.matcher(name); 560 if (!releaseMatcher.matches()) { 561 if (name.startsWith("count_")) { 562 throw new IllegalArgumentException( 563 "Bad match " + RegexUtilities.showMismatch(releaseMatcher, name)); 564 } 565 continue; 566 } 567 String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++; 568 if (releaseNum == null) { 569 releaseNum = TRUNK_VERSION; 570 } 571 VersionInfo vi = VersionInfo.getInstance(releaseNum); 572 boolean captureData = vi.equals(mostRecentVersion); 573 releases.add(releaseNum); 574 BufferedReader in = 575 FileUtilities.openUTF8Reader("", PathUtilities.getNormalizedPathString(subdir)); 576 while (true) { 577 String line = in.readLine(); 578 if (line == null) break; 579 line = line.trim(); 580 if (line.startsWith("#")) { 581 continue; 582 } 583 // common/main New: [Yellowknife] 584 // /gl//ldml/dates/timeZoneNames/zone[@type="America/Yellowknife"]/exemplarCity 585 586 String[] parts = line.split("\t"); 587 try { 588 String file = parts[0]; 589 if (file.startsWith("seed/") || !DIR_FILE_MATCHER.reset(file).find()) { 590 if (VERBOSE) { 591 System.out.println( 592 "Skipping: " 593 + RegexUtilities.showMismatch(DIR_FILE_MATCHER, file)); 594 } 595 continue; 596 } else if (VERBOSE) { 597 System.out.println("Including: " + file); 598 } 599 600 long valueCount = Long.parseLong(parts[1]); 601 long valueLen = Long.parseLong(parts[2]); 602 long attrCount = Long.parseLong(parts[3]); 603 long attrLen = Long.parseLong(parts[4]); 604 int lastSlash = file.lastIndexOf("/"); 605 String key2 = file; 606 String path = file.substring(0, lastSlash); 607 String key = file.substring(lastSlash + 1); 608 if (countryLocale.reset(key).matches()) { 609 String lang = countryLocale.group(1); 610 String script = countryLocale.group(2); 611 String country = countryLocale.group(3); 612 String writtenLang = lang + (script == null ? "" : "_" + script); 613 String locale = writtenLang + (country == null ? "" : "_" + country); 614 if (captureData) { 615 localesToPaths.put(locale, path); 616 writtenLanguages.add(writtenLang); 617 if (country != null) { 618 countries.add(country); 619 } 620 } 621 // System.out.println(key + " => " + newKey); 622 // key = writtenLang + "—" + ULocale.getDisplayName(writtenLang, "en"); 623 } 624 if (valueCount + attrCount == 0) continue; 625 release_keys.put(releaseNum, key2); 626 R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> 627 release_count = key_release_count.get(key2); 628 if (release_count == null) { 629 release_count = 630 Row.of( 631 new Counter<String>(), 632 new Counter<String>(), 633 new Counter<String>(), 634 new Counter<String>()); 635 key_release_count.put(key2, release_count); 636 } 637 release_count.get0().add(releaseNum, valueCount); 638 release_count.get1().add(releaseNum, valueLen); 639 release_count.get2().add(releaseNum, attrCount); 640 release_count.get3().add(releaseNum, attrLen); 641 } catch (Exception e) { 642 throw new IllegalArgumentException(line, e); 643 } 644 } 645 in.close(); 646 } 647 PrintWriter summary = 648 FileUtilities.openUTF8Writer( 649 OUT_DIRECTORY, 650 (MyOptions.directory.option.doesOccur() ? "filtered-" : "") 651 + "summary" 652 + ".txt"); 653 for (String file : releases) { 654 summary.print("\t" + file + "\tlen"); 655 } 656 summary.println(); 657 for (String key : key_release_count.keySet()) { 658 summary.print(key); 659 R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = 660 key_release_count.get(key); 661 for (String release2 : releases) { 662 long count = 663 release_count.get0().get(release2) + release_count.get2().get(release2); 664 long len = release_count.get1().get(release2) + release_count.get3().get(release2); 665 summary.print("\t" + count + "\t" + len); 666 } 667 summary.println(); 668 } 669 for (String release : release_keys.keySet()) { 670 System.out.println("Release:\t" + release + "\t" + release_keys.getAll(release).size()); 671 } 672 summary.close(); 673 PrintWriter summary2 = 674 FileUtilities.openUTF8Writer( 675 OUT_DIRECTORY, 676 (MyOptions.directory.option.doesOccur() ? "filtered-" : "") 677 + "locales" 678 + ".txt"); 679 summary2.println("#Languages (inc. script):\t" + writtenLanguages.size()); 680 summary2.println("#Countries:\t" + countries.size()); 681 summary2.println("#Locales:\t" + localesToPaths.size()); 682 for (Entry<String, Set<String>> entry : localesToPaths.keyValuesSet()) { 683 summary2.println(entry.getKey() + "\t" + Joiner.on("\t").join(entry.getValue())); 684 } 685 summary2.close(); 686 } 687 688 static final Set<String> ATTRIBUTES_TO_SKIP = 689 Builder.with(new HashSet<String>()) 690 .addAll("version", "references", "standard", "draft") 691 .freeze(); 692 static final Pattern skipPath = 693 PatternCache.get( 694 "" 695 + "\\[\\@alt=\"[^\"]*proposed" 696 + "|^//" 697 + "(ldml(\\[[^/]*)?/identity" 698 + "|(ldmlBCP47|supplementalData|keyboard)(\\[[^/]*)?/(generation|version)" 699 + ")"); 700 capture(DtdType type2, XPathParts parts)701 static void capture(DtdType type2, XPathParts parts) { 702 for (int i = 0; i < parts.size(); ++i) { 703 String element = parts.getElement(i); 704 ELEMENTS_OCCURRING.put(element, type2); 705 for (String attribute : parts.getAttributes(i).keySet()) { 706 ATTRIBUTES_OCCURRING.put(attribute, Row.of(type2, element)); 707 } 708 } 709 } 710 711 static class MyHandler extends SimpleHandler { 712 long valueCount; 713 long valueLen; 714 long attributeCount; 715 long attributeLen; 716 Matcher skipPathMatcher = skipPath.matcher(""); 717 Splitter lines = Splitter.onPattern("\n+").omitEmptyStrings().trimResults(); 718 String prefix; 719 int orderedCount; 720 DtdType type; 721 private final boolean isFinal; 722 MyHandler(String prefix, boolean isFinal)723 MyHandler(String prefix, boolean isFinal) { 724 this.prefix = prefix; 725 this.isFinal = isFinal; 726 } 727 728 @Override handlePathValue(String path, String value)729 public void handlePathValue(String path, String value) { 730 if (type == null) { 731 XPathParts parts = XPathParts.getFrozenInstance(path); 732 type = DtdType.valueOf(parts.getElement(0)); 733 } 734 735 ATTRIBUTE_TYPES.add(path); 736 737 if (skipPathMatcher.reset(path).find()) { 738 return; 739 } 740 String pathKey = null; 741 if (doChanges) { 742 // if (path.contains("/collations")) { 743 // System.out.println("whoops"); 744 // } 745 pathKey = fixKeyPath(path); 746 } 747 int len = value.length(); 748 value = value.trim(); 749 if (value.isEmpty() && len > 0) { 750 value = " "; 751 } 752 if (value.length() != 0) { 753 List<String> valueLines = lines.splitToList(value); 754 if (valueLines.size() == 1) { 755 valueCount++; 756 valueLen += value.length(); 757 if (doChanges) { 758 path2value.put(pathKey, value); 759 } 760 } else { 761 int count = 0; 762 for (String v : valueLines) { 763 valueCount++; 764 valueLen += v.length(); 765 if (doChanges) { 766 path2value.put(pathKey + "/_q" + count++, v); 767 } 768 } 769 } 770 } 771 XPathParts parts = XPathParts.getFrozenInstance(path); 772 if (isFinal) { 773 capture(type, parts); 774 } 775 if (path.contains("[@")) { 776 int i = parts.size() - 1; // only look at last item 777 Collection<String> attributes = parts.getAttributeKeys(i); 778 if (attributes.size() != 0) { 779 String element = parts.getElement(i); 780 for (String attribute : attributes) { 781 if (ATTRIBUTES_TO_SKIP.contains(attribute) 782 || CLDRFile.isDistinguishing(type, element, attribute)) { 783 continue; 784 } 785 String valuePart = parts.getAttributeValue(i, attribute); 786 // String[] valueParts = attrValue.split("\\s"); 787 // for (String valuePart : valueParts) { 788 attributeCount++; 789 attributeLen += valuePart.length(); 790 if (doChanges) { 791 path2value.put(pathKey + "/_" + attribute, valuePart); 792 // } 793 } 794 } 795 } 796 } 797 } 798 fixKeyPath(String path)799 private String fixKeyPath(String path) { 800 XPathParts parts = XPathParts.getFrozenInstance(path); 801 if (!SKIP_ORDERING) { 802 parts = parts.cloneAsThawed(); 803 } 804 for (int i = 0; i < parts.size(); ++i) { 805 String element = parts.getElement(i); 806 if (!SKIP_ORDERING) { 807 if (CLDRFile.isOrdered(element, type)) { 808 parts.addAttribute("_q", String.valueOf(orderedCount++)); 809 } 810 } 811 } 812 return prefix + CLDRFile.getDistinguishingXPath(parts.toString(), null); 813 } 814 } 815 check(String systemID, String name, boolean isFinal)816 private MyHandler check(String systemID, String name, boolean isFinal) { 817 MyHandler myHandler = new MyHandler(name, isFinal); 818 try { 819 XMLFileReader reader = new XMLFileReader().setHandler(myHandler); 820 reader.read(systemID, XMLFileReader.CONTENT_HANDLER, true); 821 } catch (Exception e) { 822 cantRead.put(name, Arrays.asList(e.getStackTrace())); 823 } 824 return myHandler; 825 826 // try { 827 // FileInputStream fis = new FileInputStream(systemID); 828 // XMLFileReader xmlReader = XMLFileReader.createXMLReader(true); 829 // xmlReader.setErrorHandler(new MyErrorHandler()); 830 // MyHandler myHandler = new MyHandler(); 831 // smlReader 832 // xmlReader.setHandler(myHandler); 833 // InputSource is = new InputSource(fis); 834 // is.setSystemId(systemID.toString()); 835 // xmlReader.parse(is); 836 // fis.close(); 837 // return myHandler; 838 // } catch (SAXParseException e) { 839 // System.out.println("\t" + "Can't read " + systemID); 840 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 841 // } catch (SAXException e) { 842 // System.out.println("\t" + "Can't read " + systemID); 843 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 844 // } catch (IOException e) { 845 // System.out.println("\t" + "Can't read " + systemID); 846 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 847 // } 848 } 849 850 static class MyErrorHandler implements ErrorHandler { 851 @Override error(SAXParseException exception)852 public void error(SAXParseException exception) throws SAXException { 853 System.out.println("\nerror: " + XMLFileReader.showSAX(exception)); 854 throw exception; 855 } 856 857 @Override fatalError(SAXParseException exception)858 public void fatalError(SAXParseException exception) throws SAXException { 859 System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception)); 860 throw exception; 861 } 862 863 @Override warning(SAXParseException exception)864 public void warning(SAXParseException exception) throws SAXException { 865 System.out.println("\nwarning: " + XMLFileReader.showSAX(exception)); 866 throw exception; 867 } 868 } 869 summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal)870 private void summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal) { 871 System.out.println(commonDir); 872 summary.println( 873 "#name" 874 + "\t" 875 + "value-count" 876 + "\t" 877 + "value-len" 878 + "\t" 879 + "attr-count" 880 + "\t" 881 + "attr-len"); 882 File commonDirectory = new File(commonDir); 883 if (!commonDirectory.exists()) { 884 System.out.println("Doesn't exist:\t" + commonDirectory); 885 } 886 summarizeFiles(summary, commonDirectory, isFinal, 1); 887 } 888 889 static final Set<String> SKIP_DIRS = 890 new HashSet<>(Arrays.asList("specs", "tools", "seed", "exemplars")); 891 summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level)892 public void summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level) { 893 System.out.println("\t\t\t\t\t\t\t".substring(0, level) + directory); 894 int count = 0; 895 for (File file : directory.listFiles()) { 896 String filename = file.getName(); 897 if (filename.startsWith(".")) { 898 // do nothing 899 } else if (file.isDirectory()) { 900 if (!SKIP_DIRS.contains(filename)) { 901 summarizeFiles(summary, file, isFinal, level + 1); 902 } 903 } else if (!filename.startsWith("#") && filename.endsWith(".xml")) { 904 String name = 905 new File(directory.getParent()).getName() 906 + "/" 907 + directory.getName() 908 + "/" 909 + file.getName(); 910 name = name.substring(0, name.length() - 4); // strip .xml 911 if (!RAW_FILE_MATCHER.reset(name).find()) { 912 continue; 913 } 914 if (VERBOSE) { 915 System.out.println(name); 916 } else { 917 System.out.print("."); 918 if (++count > 100) { 919 count = 0; 920 System.out.println(); 921 } 922 System.out.flush(); 923 } 924 MyHandler handler = check(file.toString(), name, isFinal); 925 summary.println( 926 name 927 + "\t" 928 + handler.valueCount 929 + "\t" 930 + handler.valueLen 931 + "\t" 932 + handler.attributeCount 933 + "\t" 934 + handler.attributeLen); 935 } 936 } 937 System.out.println(); 938 } 939 } 940