xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/GenerateItemCounts.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.base.Splitter;
5 import com.ibm.icu.impl.Relation;
6 import com.ibm.icu.impl.Row;
7 import com.ibm.icu.impl.Row.R2;
8 import com.ibm.icu.impl.Row.R4;
9 import com.ibm.icu.util.VersionInfo;
10 import java.io.BufferedReader;
11 import java.io.File;
12 import java.io.IOException;
13 import java.io.PrintWriter;
14 import java.util.ArrayList;
15 import java.util.Arrays;
16 import java.util.Collection;
17 import java.util.Collections;
18 import java.util.HashSet;
19 import java.util.LinkedHashSet;
20 import java.util.List;
21 import java.util.Map;
22 import java.util.Map.Entry;
23 import java.util.Set;
24 import java.util.TreeMap;
25 import java.util.TreeSet;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28 import org.unicode.cldr.draft.FileUtilities;
29 import org.unicode.cldr.tool.Option.Options;
30 import org.unicode.cldr.util.Builder;
31 import org.unicode.cldr.util.CLDRConfig;
32 import org.unicode.cldr.util.CLDRFile;
33 import org.unicode.cldr.util.CLDRPaths;
34 import org.unicode.cldr.util.CldrUtility;
35 import org.unicode.cldr.util.Counter;
36 import org.unicode.cldr.util.DtdData;
37 import org.unicode.cldr.util.DtdData.Attribute;
38 import org.unicode.cldr.util.DtdData.Element;
39 import org.unicode.cldr.util.DtdType;
40 import org.unicode.cldr.util.PathStarrer;
41 import org.unicode.cldr.util.PathUtilities;
42 import org.unicode.cldr.util.PatternCache;
43 import org.unicode.cldr.util.RegexUtilities;
44 import org.unicode.cldr.util.SupplementalDataInfo;
45 import org.unicode.cldr.util.XMLFileReader;
46 import org.unicode.cldr.util.XMLFileReader.SimpleHandler;
47 import org.unicode.cldr.util.XPathParts;
48 import org.xml.sax.ErrorHandler;
49 import org.xml.sax.SAXException;
50 import org.xml.sax.SAXParseException;
51 
52 public class GenerateItemCounts {
53     private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO =
54             CLDRConfig.getInstance().getSupplementalDataInfo();
55     private static final boolean SKIP_ORDERING = true;
56     private static final String OUT_DIRECTORY =
57             CLDRPaths.GEN_DIRECTORY + "/itemcount/"; // CldrUtility.MAIN_DIRECTORY;
58     private Map<String, List<StackTraceElement>> cantRead = new TreeMap<>();
59 
60     static {
61         System.err.println("Probably obsolete tool");
62     }
63 
64     private static String[] DIRECTORIES = {
65         // MUST be oldest first!
66         // "cldr-archive/cldr-21.0",
67         // "cldr-24.0",
68         "cldr-27.0", "trunk"
69     };
70 
71     private static String TRUNK_VERSION = "26.0";
72 
73     static boolean doChanges = true;
74     static Relation<String, String> path2value =
75             Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
76     static final AttributeTypes ATTRIBUTE_TYPES = new AttributeTypes();
77 
78     static final Options myOptions = new Options();
79 
80     enum MyOptions {
81         summary(
82                 null,
83                 null,
84                 "if present, summarizes data already collected. Run once with, once without."),
85         directory(
86                 ".*",
87                 ".*",
88                 "if summary, creates filtered version (eg -d main): does a find in the name, which is of the form dir/file"),
89         verbose(null, null, "verbose debugging messages"),
90         rawfilter(".*", ".*", "filter the raw files (non-summary, mostly for debugging)"),
91         ;
92         // boilerplate
93         final Option option;
94 
MyOptions(String argumentPattern, String defaultArgument, String helpText)95         MyOptions(String argumentPattern, String defaultArgument, String helpText) {
96             option = myOptions.add(this, argumentPattern, defaultArgument, helpText);
97         }
98     }
99 
100     static Matcher DIR_FILE_MATCHER;
101     static Matcher RAW_FILE_MATCHER;
102     static boolean VERBOSE;
103 
main(String[] args)104     public static void main(String[] args) throws IOException {
105         myOptions.parse(MyOptions.directory, args, true);
106 
107         DIR_FILE_MATCHER = PatternCache.get(MyOptions.directory.option.getValue()).matcher("");
108         RAW_FILE_MATCHER = PatternCache.get(MyOptions.rawfilter.option.getValue()).matcher("");
109         VERBOSE = MyOptions.verbose.option.doesOccur();
110 
111         if (MyOptions.summary.option.doesOccur()) {
112             doSummary();
113             System.out.println("DONE");
114             return;
115             // } else if (arg.equals("changes")) {
116             // doChanges = true;
117         } else {
118         }
119         // Pattern dirPattern = dirPattern = PatternCache.get(arg);
120         GenerateItemCounts main = new GenerateItemCounts();
121         try {
122             Relation<String, String> oldPath2value = null;
123             for (String dir : DIRECTORIES) {
124                 // if (dirPattern != null && !dirPattern.matcher(dir).find()) continue;
125                 final String pathname =
126                         dir.equals("trunk")
127                                 ? CLDRPaths.BASE_DIRECTORY
128                                 : CLDRPaths.ARCHIVE_DIRECTORY + "/" + dir;
129                 boolean isFinal = dir == DIRECTORIES[DIRECTORIES.length - 1];
130 
131                 String fulldir = PathUtilities.getNormalizedPathString(pathname);
132                 String prefix = (MyOptions.rawfilter.option.doesOccur() ? "filtered_" : "");
133                 String fileKey = dir.replace("/", "_");
134                 try (PrintWriter summary =
135                                 FileUtilities.openUTF8Writer(
136                                         OUT_DIRECTORY, prefix + fileKey + "_count.txt");
137                         PrintWriter changes =
138                                 FileUtilities.openUTF8Writer(
139                                         OUT_DIRECTORY, prefix + fileKey + "_changes.txt");
140                         PrintWriter changesNew =
141                                 FileUtilities.openUTF8Writer(
142                                         OUT_DIRECTORY, prefix + fileKey + "_news.txt");
143                         PrintWriter changesDeletes =
144                                 FileUtilities.openUTF8Writer(
145                                         OUT_DIRECTORY, prefix + fileKey + "_deletes.txt");
146                         PrintWriter changesSummary =
147                                 FileUtilities.openUTF8Writer(
148                                         OUT_DIRECTORY,
149                                         prefix + fileKey + "_changes_summary.txt"); ) {
150                     main.summarizeCoverage(summary, fulldir, isFinal);
151                     if (doChanges) {
152                         if (oldPath2value != null) {
153                             compare(
154                                     summary,
155                                     changes,
156                                     changesNew,
157                                     changesDeletes,
158                                     changesSummary,
159                                     oldPath2value,
160                                     path2value);
161                             checkBadAttributes(path2value, prefix + fileKey + "_dtd_check.txt");
162                         }
163                         oldPath2value = path2value;
164                         path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
165                     }
166                 }
167             }
168             ATTRIBUTE_TYPES.showStarred();
169         } finally {
170             if (main.cantRead.size() != 0) {
171                 System.out.println("Couldn't read:\t");
172                 for (String file : main.cantRead.keySet()) {
173                     System.out.println(file + "\t" + main.cantRead.get(file));
174                 }
175             }
176             System.out.println("DONE");
177         }
178     }
179 
180     static final Set<String> SKIP_ATTRIBUTES =
181             new HashSet<>(Arrays.asList("draft", "references", "validSubLocales"));
182 
183     static final Relation<String, DtdType> ELEMENTS_OCCURRING =
184             Relation.of(new TreeMap(), TreeSet.class);
185     static final Relation<String, DtdType> ELEMENTS_POSSIBLE =
186             Relation.of(new TreeMap(), TreeSet.class);
187     static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_OCCURRING =
188             Relation.of(new TreeMap(), TreeSet.class);
189     static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_POSSIBLE =
190             Relation.of(new TreeMap(), TreeSet.class);
191 
checkBadAttributes(Relation<String, String> path2value2, String outputFile)192     private static void checkBadAttributes(Relation<String, String> path2value2, String outputFile)
193             throws IOException {
194         // an attribute is misplaced if it is not distinguishing, but is on a non-final node.
195 
196         Set<String> errors = new LinkedHashSet<>();
197 
198         SupplementalDataInfo supp = SUPPLEMENTAL_DATA_INFO;
199         for (DtdType dtdType : DtdType.values()) {
200             if (dtdType.getStatus() != DtdType.DtdStatus.active) continue;
201             if (dtdType == DtdType.ldmlICU) {
202                 continue;
203             }
204             DtdData data = DtdData.getInstance(dtdType);
205             for (Element element : data.getElements()) {
206                 String elementName = element.name;
207                 ELEMENTS_POSSIBLE.put(elementName, dtdType);
208                 final Set<Element> children = element.getChildren().keySet();
209 
210                 boolean skipFinal =
211                         children.isEmpty()
212                                 || children.size() == 1
213                                         && children.iterator().next().name.equals("special");
214 
215                 for (Entry<Attribute, Integer> attributeInt : element.getAttributes().entrySet()) {
216                     Attribute attribute = attributeInt.getKey();
217                     String attributeName = attribute.name;
218                     if (attribute.defaultValue != null) {
219                         errors.add(
220                                 "Warning, default value «"
221                                         + attribute.defaultValue
222                                         + "» for: "
223                                         + dtdType
224                                         + "\t"
225                                         + elementName
226                                         + "\t"
227                                         + attributeName);
228                     }
229                     final R2<DtdType, String> attributeRow = Row.of(dtdType, elementName);
230                     ATTRIBUTES_POSSIBLE.put(attributeName, attributeRow);
231                     if (skipFinal
232                             || SKIP_ATTRIBUTES.contains(
233                                     attributeName)) { // don't worry about non-final, references,
234                         // draft, standard
235                         continue;
236                     }
237                     if (supp.isDeprecated(dtdType, elementName, attributeName, null)) {
238                         continue;
239                     }
240                     if (!CLDRFile.isDistinguishing(dtdType, elementName, attributeName)) {
241                         String doesOccur = "";
242                         final Set<R2<DtdType, String>> attributeRows =
243                                 ATTRIBUTES_OCCURRING.get(attributeName);
244                         if (attributeRows == null || !attributeRows.contains(attributeRow)) {
245                             doesOccur = "\tNEVER";
246                         }
247                         errors.add(
248                                 "Warning, !disting, !leaf: "
249                                         + dtdType
250                                         + "\t"
251                                         + elementName
252                                         + "\t"
253                                         + attributeName
254                                         + "\t"
255                                         + children
256                                         + doesOccur);
257                     }
258                 }
259             }
260         }
261         try (PrintWriter out = FileUtilities.openUTF8Writer(OUT_DIRECTORY, outputFile)) {
262             out.println("\nElements\tDeprecated\tOccurring\tPossible in DTD, but never occurs");
263 
264             for (Entry<String, Set<DtdType>> x : ELEMENTS_POSSIBLE.keyValuesSet()) {
265                 final String element = x.getKey();
266                 if (element.equals("#PCDATA")
267                         || element.equals("ANY")
268                         || element.equals("generation")) {
269                     continue;
270                 }
271                 final Set<DtdType> possible = x.getValue();
272                 Set<DtdType> deprecated = new TreeSet();
273                 for (DtdType dtdType : possible) {
274                     if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, "*", "*")) {
275                         deprecated.add(dtdType);
276                     }
277                 }
278                 Set<DtdType> notDeprecated = new TreeSet(possible);
279                 notDeprecated.removeAll(deprecated);
280 
281                 Set<DtdType> occurs =
282                         CldrUtility.ifNull(ELEMENTS_OCCURRING.get(element), Collections.EMPTY_SET);
283                 Set<DtdType> noOccur = new TreeSet(possible);
284                 noOccur.removeAll(occurs);
285 
286                 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur
287                     final Set<DtdType> intersection = CldrUtility.intersect(deprecated, occurs);
288                     errors.add(
289                             "Error: element «"
290                                     + element
291                                     + "» is deprecated in "
292                                     + (deprecated.equals(possible) ? "EVERYWHERE" : intersection)
293                                     + " but occurs in live data: "
294                                     + intersection);
295                 }
296                 if (!Collections.disjoint(
297                         notDeprecated, noOccur)) { // if !deprecated & !occur, warning
298                     errors.add(
299                             "Warning: element «"
300                                     + element
301                                     + "» doesn't occur in and is not deprecated in "
302                                     + CldrUtility.intersect(notDeprecated, noOccur));
303                 }
304 
305                 out.println(element + "\t" + deprecated + "\t" + occurs + "\t" + noOccur);
306             }
307 
308             out.println("\nAttributes\tDeprecated\tOccurring\tPossible in DTD, but never occurs");
309 
310             for (Entry<String, Set<R2<DtdType, String>>> x : ATTRIBUTES_POSSIBLE.keyValuesSet()) {
311                 final String attribute = x.getKey();
312                 if (attribute.equals("alt")
313                         || attribute.equals("draft")
314                         || attribute.equals("references")) {
315                     continue;
316                 }
317                 final Set<R2<DtdType, String>> possible = x.getValue();
318                 Set<R2<DtdType, String>> deprecated = new TreeSet();
319                 for (R2<DtdType, String> s : possible) {
320                     final DtdType dtdType = s.get0();
321                     final String element = s.get1();
322                     if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, attribute, "*")) {
323                         deprecated.add(s);
324                     }
325                 }
326                 Set<R2<DtdType, String>> notDeprecated = new TreeSet(possible);
327                 notDeprecated.removeAll(deprecated);
328 
329                 Set<R2<DtdType, String>> occurs =
330                         CldrUtility.ifNull(
331                                 ATTRIBUTES_OCCURRING.get(attribute), Collections.EMPTY_SET);
332                 Set<R2<DtdType, String>> noOccur = new TreeSet(possible);
333                 noOccur.removeAll(occurs);
334 
335                 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur
336                     final Set<R2<DtdType, String>> intersection =
337                             CldrUtility.intersect(deprecated, occurs);
338                     errors.add(
339                             "Error: attribute «"
340                                     + attribute
341                                     + "» is deprecated in "
342                                     + (deprecated.equals(possible) ? "EVERYWHERE" : intersection)
343                                     + " but occurs in live data: "
344                                     + intersection);
345                 }
346                 if (!Collections.disjoint(
347                         notDeprecated, noOccur)) { // if !deprecated & !occur, warning
348                     errors.add(
349                             "Warning: attribute «"
350                                     + attribute
351                                     + "» doesn't occur in and is not deprecated in "
352                                     + CldrUtility.intersect(notDeprecated, noOccur));
353                 }
354                 out.println(attribute + "\t" + deprecated + "\t" + occurs + "\t" + noOccur);
355             }
356             out.println("\nERRORS/WARNINGS");
357             out.println(Joiner.on("\n").join(errors));
358         }
359     }
360 
361     static class AttributeTypes {
362         Relation<String, String> elementPathToAttributes =
363                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
364         final PathStarrer PATH_STARRER = new PathStarrer().setSubstitutionPattern("*");
365         final Set<String> STARRED_PATHS = new TreeSet<>();
366         StringBuilder elementPath = new StringBuilder();
367 
add(String path)368         public void add(String path) {
369             XPathParts parts = XPathParts.getFrozenInstance(path);
370             elementPath.setLength(0);
371             for (int i = 0; i < parts.size(); ++i) {
372                 String element = parts.getElement(i);
373                 elementPath.append('/').append(element);
374                 elementPathToAttributes.putAll(
375                         elementPath.toString().intern(), parts.getAttributeKeys(i));
376             }
377         }
378 
showStarred()379         public void showStarred() throws IOException {
380             PrintWriter starred = FileUtilities.openUTF8Writer(OUT_DIRECTORY, "starred" + ".txt");
381 
382             for (Entry<String, Set<String>> entry : elementPathToAttributes.keyValuesSet()) {
383                 Set<String> attributes = entry.getValue();
384                 if (attributes.size() == 0) {
385                     continue;
386                 }
387                 String path = entry.getKey();
388                 String[] elements = path.split("/");
389                 DtdType type = DtdType.valueOf(elements[1]);
390                 String finalElement = elements[elements.length - 1];
391                 starred.print(path);
392                 for (String attribute : attributes) {
393                     if (CLDRFile.isDistinguishing(type, finalElement, attribute)) {
394                         starred.print("[@" + attribute + "='disting.']");
395                     } else {
396                         starred.print("[@" + attribute + "='DATA']");
397                     }
398                 }
399                 starred.println();
400             }
401             starred.close();
402         }
403     }
404 
405     static Pattern prefix = PatternCache.get("([^/]+/[^/]+)(.*)");
406 
407     static class Delta {
408         Counter<String> newCount = new Counter<>();
409         Counter<String> deletedCount = new Counter<>();
410         Counter<String> changedCount = new Counter<>();
411         Counter<String> unchangedCount = new Counter<>();
412 
print(PrintWriter changesSummary, Set<String> prefixes)413         void print(PrintWriter changesSummary, Set<String> prefixes) {
414             changesSummary.println(
415                     "Total"
416                             + "\t"
417                             + unchangedCount.getTotal()
418                             + "\t"
419                             + deletedCount.getTotal()
420                             + "\t"
421                             + changedCount.getTotal()
422                             + "\t"
423                             + newCount.getTotal());
424             changesSummary.println("Directory\tSame\tRemoved\tChanged\tAdded");
425             for (String prefix : prefixes) {
426                 changesSummary.println(
427                         prefix
428                                 + "\t"
429                                 + unchangedCount.get(prefix)
430                                 + "\t"
431                                 + deletedCount.get(prefix)
432                                 + "\t"
433                                 + changedCount.get(prefix)
434                                 + "\t"
435                                 + newCount.get(prefix));
436             }
437         }
438     }
439 
compare( PrintWriter summary, PrintWriter changes, PrintWriter changesNew, PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value, Relation<String, String> path2value2)440     private static void compare(
441             PrintWriter summary,
442             PrintWriter changes,
443             PrintWriter changesNew,
444             PrintWriter changesDeletes,
445             PrintWriter changesSummary,
446             Relation<String, String> oldPath2value,
447             Relation<String, String> path2value2) {
448         Set<String> union =
449                 Builder.with(new TreeSet<String>())
450                         .addAll(oldPath2value.keySet())
451                         .addAll(path2value2.keySet())
452                         .get();
453         long total = 0;
454         Matcher prefixMatcher = prefix.matcher("");
455         Delta charCount = new Delta();
456         Delta itemCount = new Delta();
457         Set<String> prefixes = new TreeSet();
458         for (String path : union) {
459             if (!prefixMatcher.reset(path).find()) {
460                 throw new IllegalArgumentException();
461             }
462             String prefix = prefixMatcher.group(1);
463             prefixes.add(prefix);
464             String localPath = prefixMatcher.group(2);
465             Set<String> set1 = oldPath2value.getAll(path);
466             Set<String> set2 = path2value2.getAll(path);
467             if (set2 != null) {
468                 total += set2.size();
469             }
470             if (set1 == null) {
471                 changesNew.println(prefix + "\t" + "\t" + set2 + "\t" + localPath);
472                 itemCount.newCount.add(prefix, set2.size());
473                 charCount.newCount.add(prefix, totalLength(set2));
474             } else if (set2 == null) {
475                 changesDeletes.println(prefix + "\t" + set1 + "\t\t" + localPath);
476                 itemCount.deletedCount.add(prefix, -set1.size());
477                 charCount.deletedCount.add(prefix, -totalLength(set1));
478             } else if (!set1.equals(set2)) {
479                 TreeSet<String> set1minus2 =
480                         Builder.with(new TreeSet<String>()).addAll(set1).removeAll(set2).get();
481                 TreeSet<String> set2minus1 =
482                         Builder.with(new TreeSet<String>()).addAll(set2).removeAll(set1).get();
483                 TreeSet<String> set2and1 =
484                         Builder.with(new TreeSet<String>()).addAll(set2).retainAll(set1).get();
485                 itemCount.changedCount.add(prefix, (set2minus1.size() + set1minus2.size() + 1) / 2);
486                 itemCount.unchangedCount.add(prefix, set2and1.size());
487                 charCount.changedCount.add(
488                         prefix, (totalLength(set2minus1) + totalLength(set1minus2) + 1) / 2);
489                 charCount.unchangedCount.add(prefix, totalLength(set2and1));
490                 changes.println(prefix + "\t" + set1minus2 + "\t" + set2minus1 + "\t" + localPath);
491             } else {
492                 itemCount.unchangedCount.add(prefix, set2.size());
493                 charCount.unchangedCount.add(prefix, totalLength(set2));
494             }
495         }
496         itemCount.print(changesSummary, prefixes);
497         changesSummary.println();
498         charCount.print(changesSummary, prefixes);
499         //        union = Builder.with(new TreeSet<String>())
500         //            .addAll(newCount.keySet())
501         //            .addAll(deletedCount.keySet())
502         //            .addAll(changedCount.keySet())
503         //            .addAll(unchangedCount.keySet())
504         //            .get();
505         summary.println("#Total:\t" + total);
506     }
507 
totalLength(Set<String> set2)508     private static long totalLength(Set<String> set2) {
509         int result = 0;
510         for (String s : set2) {
511             result += s.length();
512         }
513         return result;
514     }
515 
516     static final Pattern LOCALE_PATTERN =
517             PatternCache.get(
518                     "([a-z]{2,3})(?:[_-]([A-Z][a-z]{3}))?(?:[_-]([a-zA-Z0-9]{2,3}))?([_-][a-zA-Z0-9]{1,8})*");
519 
doSummary()520     public static void doSummary() throws IOException {
521         Map<String, R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>>
522                 key_release_count = new TreeMap<>();
523         Matcher countryLocale = LOCALE_PATTERN.matcher("");
524         List<String> releases = new ArrayList<>();
525         Pattern releaseNumber = PatternCache.get("count_(?:.*-(\\d+(\\.\\d+)*)|trunk)\\.txt");
526         // int releaseCount = 1;
527         Relation<String, String> release_keys =
528                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
529         Relation<String, String> localesToPaths =
530                 Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class);
531         Set<String> writtenLanguages = new TreeSet<>();
532         Set<String> countries = new TreeSet<>();
533 
534         File[] listFiles = new File(OUT_DIRECTORY).listFiles();
535         // find the most recent version
536         VersionInfo mostRecentVersion = VersionInfo.getInstance(0);
537         for (File subdir : listFiles) {
538             final String name = subdir.getName();
539             final Matcher releaseMatcher = releaseNumber.matcher(name);
540             if (!releaseMatcher.matches()) {
541                 if (name.startsWith("count_")) {
542                     throw new IllegalArgumentException(
543                             "Bad match " + RegexUtilities.showMismatch(releaseMatcher, name));
544                 }
545                 continue;
546             }
547             String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++;
548             if (releaseNum == null) {
549                 releaseNum = TRUNK_VERSION;
550             }
551             VersionInfo vi = VersionInfo.getInstance(releaseNum);
552             if (vi.compareTo(mostRecentVersion) > 0) {
553                 mostRecentVersion = vi;
554             }
555         }
556 
557         for (File subdir : listFiles) {
558             final String name = subdir.getName();
559             final Matcher releaseMatcher = releaseNumber.matcher(name);
560             if (!releaseMatcher.matches()) {
561                 if (name.startsWith("count_")) {
562                     throw new IllegalArgumentException(
563                             "Bad match " + RegexUtilities.showMismatch(releaseMatcher, name));
564                 }
565                 continue;
566             }
567             String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++;
568             if (releaseNum == null) {
569                 releaseNum = TRUNK_VERSION;
570             }
571             VersionInfo vi = VersionInfo.getInstance(releaseNum);
572             boolean captureData = vi.equals(mostRecentVersion);
573             releases.add(releaseNum);
574             BufferedReader in =
575                     FileUtilities.openUTF8Reader("", PathUtilities.getNormalizedPathString(subdir));
576             while (true) {
577                 String line = in.readLine();
578                 if (line == null) break;
579                 line = line.trim();
580                 if (line.startsWith("#")) {
581                     continue;
582                 }
583                 // common/main  New:        [Yellowknife]
584                 // /gl//ldml/dates/timeZoneNames/zone[@type="America/Yellowknife"]/exemplarCity
585 
586                 String[] parts = line.split("\t");
587                 try {
588                     String file = parts[0];
589                     if (file.startsWith("seed/") || !DIR_FILE_MATCHER.reset(file).find()) {
590                         if (VERBOSE) {
591                             System.out.println(
592                                     "Skipping: "
593                                             + RegexUtilities.showMismatch(DIR_FILE_MATCHER, file));
594                         }
595                         continue;
596                     } else if (VERBOSE) {
597                         System.out.println("Including: " + file);
598                     }
599 
600                     long valueCount = Long.parseLong(parts[1]);
601                     long valueLen = Long.parseLong(parts[2]);
602                     long attrCount = Long.parseLong(parts[3]);
603                     long attrLen = Long.parseLong(parts[4]);
604                     int lastSlash = file.lastIndexOf("/");
605                     String key2 = file;
606                     String path = file.substring(0, lastSlash);
607                     String key = file.substring(lastSlash + 1);
608                     if (countryLocale.reset(key).matches()) {
609                         String lang = countryLocale.group(1);
610                         String script = countryLocale.group(2);
611                         String country = countryLocale.group(3);
612                         String writtenLang = lang + (script == null ? "" : "_" + script);
613                         String locale = writtenLang + (country == null ? "" : "_" + country);
614                         if (captureData) {
615                             localesToPaths.put(locale, path);
616                             writtenLanguages.add(writtenLang);
617                             if (country != null) {
618                                 countries.add(country);
619                             }
620                         }
621                         // System.out.println(key + " => " + newKey);
622                         // key = writtenLang + "—" + ULocale.getDisplayName(writtenLang, "en");
623                     }
624                     if (valueCount + attrCount == 0) continue;
625                     release_keys.put(releaseNum, key2);
626                     R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>
627                             release_count = key_release_count.get(key2);
628                     if (release_count == null) {
629                         release_count =
630                                 Row.of(
631                                         new Counter<String>(),
632                                         new Counter<String>(),
633                                         new Counter<String>(),
634                                         new Counter<String>());
635                         key_release_count.put(key2, release_count);
636                     }
637                     release_count.get0().add(releaseNum, valueCount);
638                     release_count.get1().add(releaseNum, valueLen);
639                     release_count.get2().add(releaseNum, attrCount);
640                     release_count.get3().add(releaseNum, attrLen);
641                 } catch (Exception e) {
642                     throw new IllegalArgumentException(line, e);
643                 }
644             }
645             in.close();
646         }
647         PrintWriter summary =
648                 FileUtilities.openUTF8Writer(
649                         OUT_DIRECTORY,
650                         (MyOptions.directory.option.doesOccur() ? "filtered-" : "")
651                                 + "summary"
652                                 + ".txt");
653         for (String file : releases) {
654             summary.print("\t" + file + "\tlen");
655         }
656         summary.println();
657         for (String key : key_release_count.keySet()) {
658             summary.print(key);
659             R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count =
660                     key_release_count.get(key);
661             for (String release2 : releases) {
662                 long count =
663                         release_count.get0().get(release2) + release_count.get2().get(release2);
664                 long len = release_count.get1().get(release2) + release_count.get3().get(release2);
665                 summary.print("\t" + count + "\t" + len);
666             }
667             summary.println();
668         }
669         for (String release : release_keys.keySet()) {
670             System.out.println("Release:\t" + release + "\t" + release_keys.getAll(release).size());
671         }
672         summary.close();
673         PrintWriter summary2 =
674                 FileUtilities.openUTF8Writer(
675                         OUT_DIRECTORY,
676                         (MyOptions.directory.option.doesOccur() ? "filtered-" : "")
677                                 + "locales"
678                                 + ".txt");
679         summary2.println("#Languages (inc. script):\t" + writtenLanguages.size());
680         summary2.println("#Countries:\t" + countries.size());
681         summary2.println("#Locales:\t" + localesToPaths.size());
682         for (Entry<String, Set<String>> entry : localesToPaths.keyValuesSet()) {
683             summary2.println(entry.getKey() + "\t" + Joiner.on("\t").join(entry.getValue()));
684         }
685         summary2.close();
686     }
687 
    /**
     * Attribute names that {@code MyHandler.handlePathValue} never counts: an attribute whose name
     * is in this set is skipped before its value is added to the attribute count and length.
     *
     * <p>NOTE(review): {@code freeze()} presumably makes the set immutable; confirm against
     * {@code Builder}.
     */
    static final Set<String> ATTRIBUTES_TO_SKIP =
            Builder.with(new HashSet<String>())
                    .addAll("version", "references", "standard", "draft")
                    .freeze();
    /**
     * Paths that are excluded from counting. {@code MyHandler} applies this pattern with {@code
     * find()}, so the first alternative may match anywhere in the path while the others are
     * anchored to the start of the path by {@code ^//}.
     *
     * <ul>
     *   <li>any path containing an {@code alt} attribute whose value contains {@code proposed};
     *   <li>{@code //ldml/identity} (the root element may carry attributes);
     *   <li>{@code generation} or {@code version} directly under an {@code ldmlBCP47}, {@code
     *       supplementalData} or {@code keyboard} root (again optionally with attributes).
     * </ul>
     */
    static final Pattern skipPath =
            PatternCache.get(
                    ""
                            + "\\[\\@alt=\"[^\"]*proposed"
                            + "|^//"
                            + "(ldml(\\[[^/]*)?/identity"
                            + "|(ldmlBCP47|supplementalData|keyboard)(\\[[^/]*)?/(generation|version)"
                            + ")");
700 
capture(DtdType type2, XPathParts parts)701     static void capture(DtdType type2, XPathParts parts) {
702         for (int i = 0; i < parts.size(); ++i) {
703             String element = parts.getElement(i);
704             ELEMENTS_OCCURRING.put(element, type2);
705             for (String attribute : parts.getAttributes(i).keySet()) {
706                 ATTRIBUTES_OCCURRING.put(attribute, Row.of(type2, element));
707             }
708         }
709     }
710 
711     static class MyHandler extends SimpleHandler {
712         long valueCount;
713         long valueLen;
714         long attributeCount;
715         long attributeLen;
716         Matcher skipPathMatcher = skipPath.matcher("");
717         Splitter lines = Splitter.onPattern("\n+").omitEmptyStrings().trimResults();
718         String prefix;
719         int orderedCount;
720         DtdType type;
721         private final boolean isFinal;
722 
MyHandler(String prefix, boolean isFinal)723         MyHandler(String prefix, boolean isFinal) {
724             this.prefix = prefix;
725             this.isFinal = isFinal;
726         }
727 
728         @Override
handlePathValue(String path, String value)729         public void handlePathValue(String path, String value) {
730             if (type == null) {
731                 XPathParts parts = XPathParts.getFrozenInstance(path);
732                 type = DtdType.valueOf(parts.getElement(0));
733             }
734 
735             ATTRIBUTE_TYPES.add(path);
736 
737             if (skipPathMatcher.reset(path).find()) {
738                 return;
739             }
740             String pathKey = null;
741             if (doChanges) {
742                 // if (path.contains("/collations")) {
743                 // System.out.println("whoops");
744                 // }
745                 pathKey = fixKeyPath(path);
746             }
747             int len = value.length();
748             value = value.trim();
749             if (value.isEmpty() && len > 0) {
750                 value = " ";
751             }
752             if (value.length() != 0) {
753                 List<String> valueLines = lines.splitToList(value);
754                 if (valueLines.size() == 1) {
755                     valueCount++;
756                     valueLen += value.length();
757                     if (doChanges) {
758                         path2value.put(pathKey, value);
759                     }
760                 } else {
761                     int count = 0;
762                     for (String v : valueLines) {
763                         valueCount++;
764                         valueLen += v.length();
765                         if (doChanges) {
766                             path2value.put(pathKey + "/_q" + count++, v);
767                         }
768                     }
769                 }
770             }
771             XPathParts parts = XPathParts.getFrozenInstance(path);
772             if (isFinal) {
773                 capture(type, parts);
774             }
775             if (path.contains("[@")) {
776                 int i = parts.size() - 1; // only look at last item
777                 Collection<String> attributes = parts.getAttributeKeys(i);
778                 if (attributes.size() != 0) {
779                     String element = parts.getElement(i);
780                     for (String attribute : attributes) {
781                         if (ATTRIBUTES_TO_SKIP.contains(attribute)
782                                 || CLDRFile.isDistinguishing(type, element, attribute)) {
783                             continue;
784                         }
785                         String valuePart = parts.getAttributeValue(i, attribute);
786                         // String[] valueParts = attrValue.split("\\s");
787                         // for (String valuePart : valueParts) {
788                         attributeCount++;
789                         attributeLen += valuePart.length();
790                         if (doChanges) {
791                             path2value.put(pathKey + "/_" + attribute, valuePart);
792                             // }
793                         }
794                     }
795                 }
796             }
797         }
798 
fixKeyPath(String path)799         private String fixKeyPath(String path) {
800             XPathParts parts = XPathParts.getFrozenInstance(path);
801             if (!SKIP_ORDERING) {
802                 parts = parts.cloneAsThawed();
803             }
804             for (int i = 0; i < parts.size(); ++i) {
805                 String element = parts.getElement(i);
806                 if (!SKIP_ORDERING) {
807                     if (CLDRFile.isOrdered(element, type)) {
808                         parts.addAttribute("_q", String.valueOf(orderedCount++));
809                     }
810                 }
811             }
812             return prefix + CLDRFile.getDistinguishingXPath(parts.toString(), null);
813         }
814     }
815 
check(String systemID, String name, boolean isFinal)816     private MyHandler check(String systemID, String name, boolean isFinal) {
817         MyHandler myHandler = new MyHandler(name, isFinal);
818         try {
819             XMLFileReader reader = new XMLFileReader().setHandler(myHandler);
820             reader.read(systemID, XMLFileReader.CONTENT_HANDLER, true);
821         } catch (Exception e) {
822             cantRead.put(name, Arrays.asList(e.getStackTrace()));
823         }
824         return myHandler;
825 
826         // try {
827         // FileInputStream fis = new FileInputStream(systemID);
828         // XMLFileReader xmlReader = XMLFileReader.createXMLReader(true);
829         // xmlReader.setErrorHandler(new MyErrorHandler());
830         // MyHandler myHandler = new MyHandler();
831         // smlReader
832         // xmlReader.setHandler(myHandler);
833         // InputSource is = new InputSource(fis);
834         // is.setSystemId(systemID.toString());
835         // xmlReader.parse(is);
836         // fis.close();
837         // return myHandler;
838         // } catch (SAXParseException e) {
839         // System.out.println("\t" + "Can't read " + systemID);
840         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
841         // } catch (SAXException e) {
842         // System.out.println("\t" + "Can't read " + systemID);
843         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
844         // } catch (IOException e) {
845         // System.out.println("\t" + "Can't read " + systemID);
846         // System.out.println("\t" + e.getClass() + "\t" + e.getMessage());
847         // }
848     }
849 
850     static class MyErrorHandler implements ErrorHandler {
851         @Override
error(SAXParseException exception)852         public void error(SAXParseException exception) throws SAXException {
853             System.out.println("\nerror: " + XMLFileReader.showSAX(exception));
854             throw exception;
855         }
856 
857         @Override
fatalError(SAXParseException exception)858         public void fatalError(SAXParseException exception) throws SAXException {
859             System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception));
860             throw exception;
861         }
862 
863         @Override
warning(SAXParseException exception)864         public void warning(SAXParseException exception) throws SAXException {
865             System.out.println("\nwarning: " + XMLFileReader.showSAX(exception));
866             throw exception;
867         }
868     }
869 
summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal)870     private void summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal) {
871         System.out.println(commonDir);
872         summary.println(
873                 "#name"
874                         + "\t"
875                         + "value-count"
876                         + "\t"
877                         + "value-len"
878                         + "\t"
879                         + "attr-count"
880                         + "\t"
881                         + "attr-len");
882         File commonDirectory = new File(commonDir);
883         if (!commonDirectory.exists()) {
884             System.out.println("Doesn't exist:\t" + commonDirectory);
885         }
886         summarizeFiles(summary, commonDirectory, isFinal, 1);
887     }
888 
    /** Names of subdirectories that {@code summarizeFiles} does not descend into. */
    static final Set<String> SKIP_DIRS =
            new HashSet<>(Arrays.asList("specs", "tools", "seed", "exemplars"));
891 
summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level)892     public void summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level) {
893         System.out.println("\t\t\t\t\t\t\t".substring(0, level) + directory);
894         int count = 0;
895         for (File file : directory.listFiles()) {
896             String filename = file.getName();
897             if (filename.startsWith(".")) {
898                 // do nothing
899             } else if (file.isDirectory()) {
900                 if (!SKIP_DIRS.contains(filename)) {
901                     summarizeFiles(summary, file, isFinal, level + 1);
902                 }
903             } else if (!filename.startsWith("#") && filename.endsWith(".xml")) {
904                 String name =
905                         new File(directory.getParent()).getName()
906                                 + "/"
907                                 + directory.getName()
908                                 + "/"
909                                 + file.getName();
910                 name = name.substring(0, name.length() - 4); // strip .xml
911                 if (!RAW_FILE_MATCHER.reset(name).find()) {
912                     continue;
913                 }
914                 if (VERBOSE) {
915                     System.out.println(name);
916                 } else {
917                     System.out.print(".");
918                     if (++count > 100) {
919                         count = 0;
920                         System.out.println();
921                     }
922                     System.out.flush();
923                 }
924                 MyHandler handler = check(file.toString(), name, isFinal);
925                 summary.println(
926                         name
927                                 + "\t"
928                                 + handler.valueCount
929                                 + "\t"
930                                 + handler.valueLen
931                                 + "\t"
932                                 + handler.attributeCount
933                                 + "\t"
934                                 + handler.attributeLen);
935             }
936         }
937         System.out.println();
938     }
939 }
940