xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/tool/SearchXml.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 package org.unicode.cldr.tool;
2 
3 import com.ibm.icu.impl.Relation;
4 import com.ibm.icu.impl.UnicodeRegex;
5 import com.ibm.icu.text.Transliterator;
6 import com.ibm.icu.util.Output;
7 import com.ibm.icu.util.ULocale;
8 import java.io.File;
9 import java.io.IOException;
10 import java.io.PrintWriter;
11 import java.io.StringWriter;
12 import java.util.Collections;
13 import java.util.LinkedHashMap;
14 import java.util.LinkedHashSet;
15 import java.util.Set;
16 import java.util.regex.Matcher;
17 import org.unicode.cldr.test.CoverageLevel2;
18 import org.unicode.cldr.tool.Option.Options;
19 import org.unicode.cldr.util.CLDRLocale;
20 import org.unicode.cldr.util.CLDRPaths;
21 import org.unicode.cldr.util.CLDRTool;
22 import org.unicode.cldr.util.Counter;
23 import org.unicode.cldr.util.Level;
24 import org.unicode.cldr.util.PathHeader;
25 import org.unicode.cldr.util.PathHeader.BaseUrl;
26 import org.unicode.cldr.util.PathStarrer;
27 import org.unicode.cldr.util.PathUtilities;
28 import org.unicode.cldr.util.PatternCache;
29 import org.unicode.cldr.util.SupplementalDataInfo;
30 import org.unicode.cldr.util.XMLFileReader;
31 
32 @CLDRTool(alias = "searchxml", description = "Search CLDR XML for matching paths or values")
33 public class SearchXml {
34 
    // TODO Use options
    // Filters compiled in main() via getMatcher(); each stays null when its option value is null.
    private static Matcher fileMatcher;

    private static Matcher pathMatcher;

    private static Matcher valueMatcher;
    private static Matcher levelMatcher;
    private static Matcher iRankMatcher;

    // Never assigned in this file (the assignment in main() is commented out), so it keeps its
    // default of false.
    private static boolean showFiles;
    // Gates the per-path diff output in checkFiles().
    private static boolean showValues = true;
    // Set by main() when the value pattern contains "$<digit>"; checkFiles() then substitutes
    // path capture groups into the value pattern before matching.
    private static boolean replaceValues;

    // Number of values that passed all filters; reported by main() as "Instances found".
    private static int total = 0;

    private static boolean countOnly = false;
    private static boolean verbose = false;

    // Each *Exclude flag is true when the corresponding pattern was given with a leading "!",
    // which inverts the sense of that filter.
    private static boolean pathExclude = false;
    private static boolean levelExclude = false;
    private static boolean iRankExclude = false;
    private static boolean valueExclude = false;
    private static boolean fileExclude = false;
    private static boolean unique = false;
    private static boolean groups = false;
    // Occurrence count per output line, filled by checkFiles() when "unique" is set.
    private static Counter<String> uniqueData = new Counter<>();

    // Text of the value regex, kept as the template for "$N" substitution.
    private static String valuePattern;
    // Directory named by the "other" option; when non-null, files are diffed instead of searched.
    private static File comparisonDirectory;
    private static boolean recursive;

    // Non-null only when the "kount" option occurs; counts occurrences of path group 1.
    private static Counter<String> kountRegexMatches;
    // Non-null only when the "Star" option occurs; counts occurrences per starred path.
    private static Counter<String> starCounter;
    // Read failures collected while processing; printed to stderr by processDirectory().
    private static final Set<String> ERRORS = new LinkedHashSet<>();
    private static final PathStarrer pathStarrer = new PathStarrer();
    // Non-null only when the "PathHeader" option occurs.
    private static PathHeader.Factory PATH_HEADER_FACTORY = null;
71 
    // Command-line options understood by main(). The add() arguments appear to be: option name,
    // a regex for the option's argument (null for flag-style options), a default value, and the
    // help text — NOTE(review): confirm against Option.Options.add.
    static final Options myOptions =
            new Options()
                    .add(
                            "source",
                            ".*",
                            CLDRPaths.MAIN_DIRECTORY,
                            "source directory (use also " + CLDRPaths.AUX_DIRECTORY + ")")
                    .add(
                            "file",
                            ".*",
                            null,
                            "regex to filter files. ! in front selects items that don't match.")
                    .add(
                            "path",
                            ".*",
                            null,
                            "regex to filter paths. ! in front selects items that don't match. example: -p relative.*@type=\\\"-?3\\\"")
                    .add(
                            "value",
                            ".*",
                            null,
                            "regex to filter values. ! in front selects items that don't match")
                    .add(
                            "level",
                            ".*",
                            null,
                            "regex to filter levels. ! in front selects items that don't match")
                    .add("count", null, null, "only count items")
                    .add("kount", null, null, "count regex group matches in pattern")
                    .add("other", ".+", null, "compare against other directory")
                    .add("unique", null, null, "only unique lines")
                    .add(
                            "groups",
                            null,
                            null,
                            "only retain capturing groups in path/value, eg in -p @modifiers=\\\"([^\\\"]*+)\\\", output the part in (...)")
                    .add("Verbose", null, null, "verbose output")
                    .add("recursive", null, null, "recurse directories")
                    .add("Star", null, null, "get statistics on starred paths")
                    .add("PathHeader", null, null, "show path header and string ID")
                    .add(
                            "iRank",
                            ".*",
                            null,
                            "Filter by inheritance rank, where 0 = root, ow N = inherits directly from rank N-1");
117 
main(String[] args)118     public static void main(String[] args) throws IOException {
119         double startTime = System.currentTimeMillis();
120         myOptions.parse(args, true);
121 
122         verbose = myOptions.get("Verbose").doesOccur();
123 
124         String sourceDirectory = myOptions.get("source").getValue();
125         if (sourceDirectory == null) {
126             System.out.println("#" + "Need Source Directory! ");
127             return;
128         }
129         Output<Boolean> exclude = new Output<>();
130         fileMatcher = getMatcher(myOptions.get("file").getValue(), exclude);
131         fileExclude = exclude.value;
132 
133         pathMatcher = getMatcher(myOptions.get("path").getValue(), exclude);
134         pathExclude = exclude.value;
135 
136         levelMatcher = getMatcher(myOptions.get("level").getValue(), exclude);
137         levelExclude = exclude.value;
138 
139         iRankMatcher = getMatcher(myOptions.get("iRank").getValue(), exclude);
140         iRankExclude = exclude.value;
141 
142         valueMatcher = getMatcher(myOptions.get("value").getValue(), exclude);
143         valueExclude = exclude.value;
144 
145         if (myOptions.get("Star").doesOccur()) {
146             starCounter = new Counter<>();
147         }
148 
149         if (pathMatcher != null && valueMatcher != null) {
150             valuePattern = valueMatcher.pattern().toString();
151             if (PatternCache.get("\\$\\d.*").matcher(valuePattern).find()) {
152                 replaceValues = true;
153             }
154         }
155 
156         if (myOptions.get("PathHeader").doesOccur()) {
157             PATH_HEADER_FACTORY = PathHeader.getFactory(ToolConfig.getToolInstance().getEnglish());
158         }
159 
160         unique = myOptions.get("unique").doesOccur();
161         groups = myOptions.get("groups").doesOccur();
162 
163         countOnly = myOptions.get("count").doesOccur();
164         kountRegexMatches = myOptions.get("kount").doesOccur() ? new Counter<>() : null;
165 
166         recursive = myOptions.get("recursive").doesOccur();
167 
168         // showFiles = myOptions.get("showFiles").doesOccur();
169         // showValues = myOptions.get("showValues").doesOccur();
170 
171         File src = new File(sourceDirectory);
172         if (!src.isDirectory()) {
173             System.err.println("#" + sourceDirectory + " must be a directory");
174             return;
175         }
176 
177         String comparisonDirectoryString = myOptions.get("other").getValue();
178         if (comparisonDirectoryString != null) {
179             comparisonDirectory = new File(comparisonDirectoryString);
180             if (!comparisonDirectory.isDirectory()) {
181                 System.err.println("#" + comparisonDirectoryString + " must be a directory");
182                 return;
183             }
184         }
185 
186         if (countOnly) {
187             System.out.print("file");
188             for (Level cLevel : Level.values()) {
189                 System.out.print("\t" + cLevel);
190             }
191             System.out.println();
192         }
193 
194         processDirectory(src);
195 
196         if (kountRegexMatches != null) {
197             for (String item : kountRegexMatches.getKeysetSortedByCount(false)) {
198                 System.out.println("#" + kountRegexMatches.getCount(item) + "\t" + item);
199             }
200         }
201 
202         if (unique) {
203             for (String item : uniqueData.getKeysetSortedByCount(false)) {
204                 System.out.println("#" + uniqueData.getCount(item) + item);
205             }
206         }
207 
208         if (starCounter != null) {
209             for (String path : starCounter.getKeysetSortedByCount(false)) {
210                 System.out.println("#" + starCounter.get(path) + "\t" + path);
211             }
212         }
213         double deltaTime = System.currentTimeMillis() - startTime;
214         System.out.println("#" + "Elapsed: " + deltaTime / 1000.0 + " seconds");
215         System.out.println("#" + "Instances found: " + total);
216     }
217 
getMatcher(String property, Output<Boolean> exclude)218     private static Matcher getMatcher(String property, Output<Boolean> exclude) {
219         exclude.value = false;
220         if (property == null) {
221             return null;
222         }
223         if (property.startsWith("!")) {
224             exclude.value = true;
225             property = property.substring(1);
226         }
227         Matcher result = UnicodeRegex.compile(property).matcher("");
228         //        System.out.println(result.pattern());
229         //
230         return result;
231     }
232 
processDirectory(File src)233     private static void processDirectory(File src) throws IOException {
234         if (comparisonDirectory != null) {
235             System.out.println(
236                     "#"
237                             + "Locale"
238                             + "\tFile"
239                             + "\tBase"
240                             + DiffInfo.DiffInfoHeader
241                             + "\n#\tValue\tOtherValue\tPath");
242         }
243         for (File file : src.listFiles()) {
244             if (recursive && file.isDirectory()) {
245                 processDirectory(file);
246                 continue;
247             }
248             if (file.length() == 0) {
249                 continue;
250             }
251 
252             String fileName = file.getName();
253             String canonicalFile = PathUtilities.getNormalizedPathString(file);
254 
255             if (!fileName.endsWith(".xml")) {
256                 continue;
257             }
258 
259             String coreName = fileName.substring(0, fileName.length() - 4); // remove .xml
260 
261             if (fileMatcher != null && fileExclude == fileMatcher.reset(coreName).find()) {
262                 if (verbose) {
263                     System.out.println("#" + "* -f Skipping " + canonicalFile);
264                 }
265                 continue;
266             }
267             if (iRankMatcher != null
268                     && iRankExclude
269                             == iRankMatcher
270                                     .reset(
271                                             String.valueOf(
272                                                     CLDRLocale.getInstance(coreName).getRank()))
273                                     .find()) {
274                 if (verbose) {
275                     System.out.println("#" + "* -i Skipping " + canonicalFile);
276                 }
277                 continue;
278             }
279             if (verbose) {
280                 System.out.println("#" + "Searching " + canonicalFile);
281             }
282 
283             if (showFiles) {
284                 System.out.println("#" + "* " + canonicalFile);
285             }
286 
287             Relation<String, String> source = getXmlFileAsRelation(src, fileName);
288             Relation<String, String> other = null;
289             if (comparisonDirectory != null) {
290                 other = getXmlFileAsRelation(comparisonDirectory, fileName);
291             }
292 
293             checkFiles(recursive ? file.getParent() : null, fileName, coreName, source, other);
294             System.out.flush();
295         }
296         System.out.println("#" + "\t" + DiffInfo.DiffInfoHeader);
297         DIFF_INFO.showValues("TOTAL");
298 
299         for (String error : ERRORS) {
300             System.err.println("#" + error);
301         }
302     }
303 
getXmlFileAsRelation(File directory, String fileName)304     private static Relation<String, String> getXmlFileAsRelation(File directory, String fileName) {
305         ListHandler listHandler = new ListHandler();
306         XMLFileReader xfr = new XMLFileReader().setHandler(listHandler);
307         try {
308             String fileName2 = PathUtilities.getNormalizedPathString(directory) + "/" + fileName;
309             xfr.read(fileName2, XMLFileReader.CONTENT_HANDLER | XMLFileReader.ERROR_HANDLER, false);
310         } catch (Exception e) {
311             StringWriter stringWriter = new StringWriter();
312             PrintWriter arg0 = new PrintWriter(stringWriter);
313             e.printStackTrace(arg0);
314             arg0.flush();
315             ERRORS.add("Can't read " + directory + "/" + fileName + "\n" + stringWriter);
316         }
317         return listHandler.data;
318     }
319 
    /** XML handler that records every path/value pair it is given, in encounter order. */
    static class ListHandler extends XMLFileReader.SimpleHandler {
        // Path -> values, backed by insertion-ordered map and sets.
        public Relation<String, String> data =
                Relation.of(new LinkedHashMap<String, Set<String>>(), LinkedHashSet.class);

        /** Adds {@code value} to the values recorded for {@code path}. */
        @Override
        public void handlePathValue(String path, String value) {
            data.put(path, value);
        }
    }
329 
330     // static MyHandler myHandler = new MyHandler();
331 
    // Running totals across all files; DiffInfo.showValues() adds into it and processDirectory()
    // prints it as the "TOTAL" line.
    static DiffInfo DIFF_INFO = new DiffInfo();
333 
334     static class DiffInfo {
335         static final String DiffInfoHeader = "\tSame" + "\tDeletions" + "\tAdditions" + "\tChanges";
336 
337         int additionCount = 0;
338         int deletionCount = 0;
339         int changed2Values = 0;
340         int sameCount = 0;
341 
showValues(String title)342         public void showValues(String title) {
343             System.out.println(
344                     "#"
345                             + title
346                             + "\t"
347                             + sameCount
348                             + "\t"
349                             + deletionCount
350                             + "\t"
351                             + additionCount
352                             + "\t"
353                             + (changed2Values / 2));
354             DIFF_INFO.additionCount += additionCount;
355             DIFF_INFO.deletionCount += deletionCount;
356             DIFF_INFO.changed2Values += changed2Values;
357             DIFF_INFO.sameCount += sameCount;
358         }
359     }
360 
361     /**
362      * @author markdavis
363      * @param fileName
364      * @param canonicalFile
365      */
checkFiles( String filePath, String fileName, String coreName, Relation<String, String> source, Relation<String, String> other)366     private static void checkFiles(
367             String filePath,
368             String fileName,
369             String coreName,
370             Relation<String, String> source,
371             Relation<String, String> other) {
372         CoverageLevel2 level = null;
373         String firstMessage;
374         String file;
375         Counter<Level> levelCounter = new Counter<>();
376         String canonicalFile = fileName;
377         firstMessage = "* " + canonicalFile;
378         file = canonicalFile;
379 
380         DiffInfo diffInfo = new DiffInfo();
381 
382         if (levelMatcher != null || countOnly) {
383             try {
384                 level = CoverageLevel2.getInstance(canonicalFile);
385             } catch (Exception e) {
386             }
387         }
388 
389         if (countOnly) {
390             System.out.print(fileName);
391             for (Level cLevel : Level.values()) {
392                 System.out.print("\t" + levelCounter.get(cLevel));
393             }
394             System.out.println();
395         }
396 
397         Set<String> keys = new LinkedHashSet<>(source.keySet());
398         if (other != null) {
399             keys.addAll(other.keySet());
400         }
401         for (String path : keys) {
402             if (path.startsWith("//ldml/identity/")) {
403                 continue;
404             }
405             if (pathMatcher != null && pathExclude == pathMatcher.reset(path).find()) {
406                 continue;
407             }
408 
409             Level pathLevel = null;
410 
411             pathLevel = level == null ? Level.COMPREHENSIVE : level.getLevel(path);
412             levelCounter.add(pathLevel, 1);
413 
414             if (levelMatcher != null
415                     && levelExclude == levelMatcher.reset(pathLevel.toString()).find()) {
416                 continue;
417             }
418 
419             Set<String> values = source.get(path);
420             Set<String> otherValues = other == null ? null : other.get(path);
421 
422             // if (showValues) {
423             // System.out.println("#"+values + "\t" + otherValues + "\t<=\t" + path);
424             // }
425 
426             if (other != null) {
427                 if (values != otherValues) {
428                     boolean diff = true;
429                     if (values == null) {
430                         diffInfo.additionCount += otherValues.size();
431                     } else if (otherValues == null) {
432                         diffInfo.deletionCount += values.size();
433                     } else if (!values.equals(otherValues)) {
434                         diffInfo.changed2Values += values.size() + otherValues.size();
435                     } else {
436                         diff = false;
437                         diffInfo.sameCount += values.size();
438                     }
439                     if (diff && showValues) {
440                         show(
441                                 ConfigOption.add,
442                                 filePath,
443                                 file,
444                                 null,
445                                 null,
446                                 path,
447                                 values,
448                                 otherValues);
449                     }
450                 }
451             } else {
452                 for (String value : values) {
453                     if (replaceValues) {
454                         String pattern = valuePattern;
455                         for (int i = 0; i <= pathMatcher.groupCount(); ++i) {
456                             pattern = pattern.replace("$" + i, pathMatcher.group(i));
457                         }
458                         valueMatcher = PatternCache.get(pattern).matcher("");
459                     }
460 
461                     if (valueMatcher != null && valueExclude == valueMatcher.reset(value).find()) {
462                         continue;
463                     }
464 
465                     if (kountRegexMatches != null && pathMatcher != null) {
466                         kountRegexMatches.add(pathMatcher.group(1), 1);
467                     }
468 
469                     if (starCounter != null) {
470                         starCounter.add(pathStarrer.set(path), 1);
471                     }
472                     ++total;
473 
474                     if (firstMessage != null) {
475                         // System.out.println("#"+firstMessage);
476                         firstMessage = null;
477                     }
478                     if (!countOnly) {
479                         String data =
480                                 groups
481                                         ? group(value, valueMatcher)
482                                                 + "\t"
483                                                 + group(path, pathMatcher)
484                                         : value + "\t" + path;
485                         if (!unique) {
486                             String pathHeaderInfo = "";
487                             if (PATH_HEADER_FACTORY != null) {
488                                 PathHeader pathHeader = PATH_HEADER_FACTORY.fromPath(path);
489                                 if (pathHeader != null) {
490                                     pathHeaderInfo =
491                                             "\n\t"
492                                                     + pathHeader
493                                                     + "\n\t"
494                                                     + pathHeader.getUrl(
495                                                             BaseUrl.PRODUCTION, coreName);
496                                 }
497                             }
498                             // http://st.unicode.org/cldr-apps/v#/en/Fields/59d8178ec2fe04ae
499                             if (!groups && pathHeaderInfo.isEmpty()) {
500                                 show(
501                                         ConfigOption.add,
502                                         filePath,
503                                         file,
504                                         null,
505                                         null,
506                                         path,
507                                         Collections.singleton(value),
508                                         null);
509                             } else {
510                                 System.out.println(
511                                         "#?"
512                                                 + (recursive ? filePath + "\t" : "")
513                                                 + file
514                                                 + "\t"
515                                                 + data
516                                                 + pathHeaderInfo);
517                             }
518                         } else {
519                             uniqueData.add(data, 1);
520                         }
521                     }
522                 }
523             }
524         }
525         if (other != null) {
526             ULocale locale = new ULocale(fileName.substring(0, fileName.length() - 4));
527             String localeName = locale.getDisplayName(ULocale.ENGLISH);
528             String title = localeName + "\t" + fileName + "\t" + getType(locale);
529             diffInfo.showValues(title);
530         }
531     }
532 
    /** Action names printed by show() in the {@code action=} field of each output line. */
    enum ConfigOption {
        delete,
        add,
        addNew,
        replace
    }
539 
show( ConfigOption configOption, String fileParent, String localeOrFile, String match_path, String match_value, String new_path, Set<String> new_values, Set<String> otherValues)540     public static void show(
541             ConfigOption configOption,
542             String fileParent,
543             String localeOrFile,
544             String match_path,
545             String match_value,
546             String new_path,
547             Set<String> new_values,
548             Set<String> otherValues) {
549         // locale= sv ; action=delete; value= YER ; path=
550         // //ldml/numbers/currencies/currency[@type="YER"]/symbol ;
551 
552         // locale=en ; action=delete ; path=/.*short.*/
553 
554         // locale=en ; action=add ;
555         // new_path=//ldml/localeDisplayNames/territories/territory[@type="PS"][@alt="short"] ;
556         // new_value=Palestine
557         // locale=  af     ; action=add ; new_path=
558         // //ldml/dates/fields/field[@type="second"]/relative[@type="0"]    ; new_value=    nou
559 
560         int extensionPos = localeOrFile.lastIndexOf('.');
561         String fileWithoutSuffix =
562                 extensionPos >= 0 ? localeOrFile.substring(0, extensionPos) : localeOrFile;
563 
564         String values2 =
565                 new_values == null
566                         ? null
567                         : new_values.size() != 1
568                                 ? new_values.toString()
569                                 : new_values.iterator().next();
570 
571         System.out.println(
572                 fileParent
573                         + ";\tlocale="
574                         + fileWithoutSuffix
575                         + ";\taction="
576                         + configOption
577                         + (match_value == null ? "" : ";\tvalue=" + escape(match_value))
578                         + (match_path == null ? "" : ";\tpath=" + match_path)
579                         + (values2 == null ? "" : ";\tnew_value=" + escape(values2))
580                         + (new_path == null ? "" : ";\tnew_path=" + new_path)
581                         + (otherValues == null ? "" : ";\tother_value=" + otherValues));
582     }
583 
    // Transliterator built from a rule whose filter covers whitespace and format (Cf) characters
    // other than U+0020, converting them with "hex/perl".
    static final Transliterator showInvisibles =
            Transliterator.getInstance("[[:whitespace:][:cf:]-[\\u0020]]hex/perl");

    /** Returns {@code source} transformed by {@link #showInvisibles}. */
    private static String escape(String source) {
        return showInvisibles.transform(source);
    }
590 
591     static Set<String> defaultContent =
592             SupplementalDataInfo.getInstance().getDefaultContentLocales();
593 
getType(ULocale locale)594     private static String getType(ULocale locale) {
595         if (defaultContent.contains(locale.toString())) {
596             return "DC";
597         } else if (locale.getCountry().isEmpty()) {
598             return "Base";
599         } else {
600             return "Region";
601         }
602     }
603 
group(String item, Matcher matcher)604     private static String group(String item, Matcher matcher) {
605         if (matcher == null) {
606             return item;
607         }
608         StringBuilder b = new StringBuilder();
609         for (int i = 1; i <= matcher.groupCount(); ++i) {
610             b.append(matcher.group(i));
611         }
612         return b.toString();
613     }
614 
615     //    static class StarCounter {
616     //        Map<String,Counter<String>> data = new HashMap();
617     //    }
618 }
619