1 package org.unicode.cldr.tool; 2 3 import com.ibm.icu.impl.Relation; 4 import com.ibm.icu.impl.UnicodeRegex; 5 import com.ibm.icu.text.Transliterator; 6 import com.ibm.icu.util.Output; 7 import com.ibm.icu.util.ULocale; 8 import java.io.File; 9 import java.io.IOException; 10 import java.io.PrintWriter; 11 import java.io.StringWriter; 12 import java.util.Collections; 13 import java.util.LinkedHashMap; 14 import java.util.LinkedHashSet; 15 import java.util.Set; 16 import java.util.regex.Matcher; 17 import org.unicode.cldr.test.CoverageLevel2; 18 import org.unicode.cldr.tool.Option.Options; 19 import org.unicode.cldr.util.CLDRLocale; 20 import org.unicode.cldr.util.CLDRPaths; 21 import org.unicode.cldr.util.CLDRTool; 22 import org.unicode.cldr.util.Counter; 23 import org.unicode.cldr.util.Level; 24 import org.unicode.cldr.util.PathHeader; 25 import org.unicode.cldr.util.PathHeader.BaseUrl; 26 import org.unicode.cldr.util.PathStarrer; 27 import org.unicode.cldr.util.PathUtilities; 28 import org.unicode.cldr.util.PatternCache; 29 import org.unicode.cldr.util.SupplementalDataInfo; 30 import org.unicode.cldr.util.XMLFileReader; 31 32 @CLDRTool(alias = "searchxml", description = "Search CLDR XML for matching paths or values") 33 public class SearchXml { 34 35 // TODO Use options 36 private static Matcher fileMatcher; 37 38 private static Matcher pathMatcher; 39 40 private static Matcher valueMatcher; 41 private static Matcher levelMatcher; 42 private static Matcher iRankMatcher; 43 44 private static boolean showFiles; 45 private static boolean showValues = true; 46 private static boolean replaceValues; 47 48 private static int total = 0; 49 50 private static boolean countOnly = false; 51 private static boolean verbose = false; 52 53 private static boolean pathExclude = false; 54 private static boolean levelExclude = false; 55 private static boolean iRankExclude = false; 56 private static boolean valueExclude = false; 57 private static boolean fileExclude = false; 58 private static boolean unique = false; 59 private static boolean groups = false; 60 private static Counter<String> uniqueData = new Counter<>(); 61 62 private static String valuePattern; 63 private static File comparisonDirectory; 64 private static boolean recursive; 65 66 private static Counter<String> kountRegexMatches; 67 private static Counter<String> starCounter; 68 private static final Set<String> ERRORS = new LinkedHashSet<>(); 69 private static final PathStarrer pathStarrer = new PathStarrer(); 70 private static PathHeader.Factory PATH_HEADER_FACTORY = null; 71 72 static final Options myOptions = 73 new Options() 74 .add( 75 "source", 76 ".*", 77 CLDRPaths.MAIN_DIRECTORY, 78 "source directory (use also " + CLDRPaths.AUX_DIRECTORY + ")") 79 .add( 80 "file", 81 ".*", 82 null, 83 "regex to filter files. ! in front selects items that don't match.") 84 .add( 85 "path", 86 ".*", 87 null, 88 "regex to filter paths. ! in front selects items that don't match. example: -p relative.*@type=\\\"-?3\\\"") 89 .add( 90 "value", 91 ".*", 92 null, 93 "regex to filter values. ! in front selects items that don't match") 94 .add( 95 "level", 96 ".*", 97 null, 98 "regex to filter levels. ! in front selects items that don't match") 99 .add("count", null, null, "only count items") 100 .add("kount", null, null, "count regex group matches in pattern") 101 .add("other", ".+", null, "compare against other directory") 102 .add("unique", null, null, "only unique lines") 103 .add( 104 "groups", 105 null, 106 null, 107 "only retain capturing groups in path/value, eg in -p @modifiers=\\\"([^\\\"]*+)\\\", output the part in (...)") 108 .add("Verbose", null, null, "verbose output") 109 .add("recursive", null, null, "recurse directories") 110 .add("Star", null, null, "get statistics on starred paths") 111 .add("PathHeader", null, null, "show path header and string ID") 112 .add( 113 "iRank", 114 ".*", 115 null, 116 "Filter by inheritance rank, where 0 = root, ow N = inherits directly from rank N-1"); 117 main(String[] args)118 public static void main(String[] args) throws IOException { 119 double startTime = System.currentTimeMillis(); 120 myOptions.parse(args, true); 121 122 verbose = myOptions.get("Verbose").doesOccur(); 123 124 String sourceDirectory = myOptions.get("source").getValue(); 125 if (sourceDirectory == null) { 126 System.out.println("#" + "Need Source Directory! "); 127 return; 128 } 129 Output<Boolean> exclude = new Output<>(); 130 fileMatcher = getMatcher(myOptions.get("file").getValue(), exclude); 131 fileExclude = exclude.value; 132 133 pathMatcher = getMatcher(myOptions.get("path").getValue(), exclude); 134 pathExclude = exclude.value; 135 136 levelMatcher = getMatcher(myOptions.get("level").getValue(), exclude); 137 levelExclude = exclude.value; 138 139 iRankMatcher = getMatcher(myOptions.get("iRank").getValue(), exclude); 140 iRankExclude = exclude.value; 141 142 valueMatcher = getMatcher(myOptions.get("value").getValue(), exclude); 143 valueExclude = exclude.value; 144 145 if (myOptions.get("Star").doesOccur()) { 146 starCounter = new Counter<>(); 147 } 148 149 if (pathMatcher != null && valueMatcher != null) { 150 valuePattern = valueMatcher.pattern().toString(); 151 if (PatternCache.get("\\$\\d.*").matcher(valuePattern).find()) { 152 replaceValues = true; 153 } 154 } 155 156 if (myOptions.get("PathHeader").doesOccur()) { 157 PATH_HEADER_FACTORY = PathHeader.getFactory(ToolConfig.getToolInstance().getEnglish()); 158 } 159 160 unique = myOptions.get("unique").doesOccur(); 161 groups = myOptions.get("groups").doesOccur(); 162 163 countOnly = myOptions.get("count").doesOccur(); 164 kountRegexMatches = myOptions.get("kount").doesOccur() ? new Counter<>() : null; 165 166 recursive = myOptions.get("recursive").doesOccur(); 167 168 // showFiles = myOptions.get("showFiles").doesOccur(); 169 // showValues = myOptions.get("showValues").doesOccur(); 170 171 File src = new File(sourceDirectory); 172 if (!src.isDirectory()) { 173 System.err.println("#" + sourceDirectory + " must be a directory"); 174 return; 175 } 176 177 String comparisonDirectoryString = myOptions.get("other").getValue(); 178 if (comparisonDirectoryString != null) { 179 comparisonDirectory = new File(comparisonDirectoryString); 180 if (!comparisonDirectory.isDirectory()) { 181 System.err.println("#" + comparisonDirectoryString + " must be a directory"); 182 return; 183 } 184 } 185 186 if (countOnly) { 187 System.out.print("file"); 188 for (Level cLevel : Level.values()) { 189 System.out.print("\t" + cLevel); 190 } 191 System.out.println(); 192 } 193 194 processDirectory(src); 195 196 if (kountRegexMatches != null) { 197 for (String item : kountRegexMatches.getKeysetSortedByCount(false)) { 198 System.out.println("#" + kountRegexMatches.getCount(item) + "\t" + item); 199 } 200 } 201 202 if (unique) { 203 for (String item : uniqueData.getKeysetSortedByCount(false)) { 204 System.out.println("#" + uniqueData.getCount(item) + item); 205 } 206 } 207 208 if (starCounter != null) { 209 for (String path : starCounter.getKeysetSortedByCount(false)) { 210 System.out.println("#" + starCounter.get(path) + "\t" + path); 211 } 212 } 213 double deltaTime = System.currentTimeMillis() - startTime; 214 System.out.println("#" + "Elapsed: " + deltaTime / 1000.0 + " seconds"); 215 System.out.println("#" + "Instances found: " + total); 216 } 217 getMatcher(String property, Output<Boolean> exclude)218 private static Matcher getMatcher(String property, Output<Boolean> exclude) { 219 exclude.value = false; 220 if (property == null) { 221 return null; 222 } 223 if (property.startsWith("!")) { 224 exclude.value = true; 225 property = property.substring(1); 226 } 227 Matcher result = UnicodeRegex.compile(property).matcher(""); 228 // System.out.println(result.pattern()); 229 // 230 return result; 231 } 232 processDirectory(File src)233 private static void processDirectory(File src) throws IOException { 234 if (comparisonDirectory != null) { 235 System.out.println( 236 "#" 237 + "Locale" 238 + "\tFile" 239 + "\tBase" 240 + DiffInfo.DiffInfoHeader 241 + "\n#\tValue\tOtherValue\tPath"); 242 } 243 for (File file : src.listFiles()) { 244 if (recursive && file.isDirectory()) { 245 processDirectory(file); 246 continue; 247 } 248 if (file.length() == 0) { 249 continue; 250 } 251 252 String fileName = file.getName(); 253 String canonicalFile = PathUtilities.getNormalizedPathString(file); 254 255 if (!fileName.endsWith(".xml")) { 256 continue; 257 } 258 259 String coreName = fileName.substring(0, fileName.length() - 4); // remove .xml 260 261 if (fileMatcher != null && fileExclude == fileMatcher.reset(coreName).find()) { 262 if (verbose) { 263 System.out.println("#" + "* -f Skipping " + canonicalFile); 264 } 265 continue; 266 } 267 if (iRankMatcher != null 268 && iRankExclude 269 == iRankMatcher 270 .reset( 271 String.valueOf( 272 CLDRLocale.getInstance(coreName).getRank())) 273 .find()) { 274 if (verbose) { 275 System.out.println("#" + "* -i Skipping " + canonicalFile); 276 } 277 continue; 278 } 279 if (verbose) { 280 System.out.println("#" + "Searching " + canonicalFile); 281 } 282 283 if (showFiles) { 284 System.out.println("#" + "* " + canonicalFile); 285 } 286 287 Relation<String, String> source = getXmlFileAsRelation(src, fileName); 288 Relation<String, String> other = null; 289 if (comparisonDirectory != null) { 290 other = getXmlFileAsRelation(comparisonDirectory, fileName); 291 } 292 293 checkFiles(recursive ? file.getParent() : null, fileName, coreName, source, other); 294 System.out.flush(); 295 } 296 System.out.println("#" + "\t" + DiffInfo.DiffInfoHeader); 297 DIFF_INFO.showValues("TOTAL"); 298 299 for (String error : ERRORS) { 300 System.err.println("#" + error); 301 } 302 } 303 getXmlFileAsRelation(File directory, String fileName)304 private static Relation<String, String> getXmlFileAsRelation(File directory, String fileName) { 305 ListHandler listHandler = new ListHandler(); 306 XMLFileReader xfr = new XMLFileReader().setHandler(listHandler); 307 try { 308 String fileName2 = PathUtilities.getNormalizedPathString(directory) + "/" + fileName; 309 xfr.read(fileName2, XMLFileReader.CONTENT_HANDLER | XMLFileReader.ERROR_HANDLER, false); 310 } catch (Exception e) { 311 StringWriter stringWriter = new StringWriter(); 312 PrintWriter arg0 = new PrintWriter(stringWriter); 313 e.printStackTrace(arg0); 314 arg0.flush(); 315 ERRORS.add("Can't read " + directory + "/" + fileName + "\n" + stringWriter); 316 } 317 return listHandler.data; 318 } 319 320 static class ListHandler extends XMLFileReader.SimpleHandler { 321 public Relation<String, String> data = 322 Relation.of(new LinkedHashMap<String, Set<String>>(), LinkedHashSet.class); 323 324 @Override handlePathValue(String path, String value)325 public void handlePathValue(String path, String value) { 326 data.put(path, value); 327 } 328 } 329 330 // static MyHandler myHandler = new MyHandler(); 331 332 static DiffInfo DIFF_INFO = new DiffInfo(); 333 334 static class DiffInfo { 335 static final String DiffInfoHeader = "\tSame" + "\tDeletions" + "\tAdditions" + "\tChanges"; 336 337 int additionCount = 0; 338 int deletionCount = 0; 339 int changed2Values = 0; 340 int sameCount = 0; 341 showValues(String title)342 public void showValues(String title) { 343 System.out.println( 344 "#" 345 + title 346 + "\t" 347 + sameCount 348 + "\t" 349 + deletionCount 350 + "\t" 351 + additionCount 352 + "\t" 353 + (changed2Values / 2)); 354 DIFF_INFO.additionCount += additionCount; 355 DIFF_INFO.deletionCount += deletionCount; 356 DIFF_INFO.changed2Values += changed2Values; 357 DIFF_INFO.sameCount += sameCount; 358 } 359 } 360 361 /** 362 * @author markdavis 363 * @param fileName 364 * @param canonicalFile 365 */ checkFiles( String filePath, String fileName, String coreName, Relation<String, String> source, Relation<String, String> other)366 private static void checkFiles( 367 String filePath, 368 String fileName, 369 String coreName, 370 Relation<String, String> source, 371 Relation<String, String> other) { 372 CoverageLevel2 level = null; 373 String firstMessage; 374 String file; 375 Counter<Level> levelCounter = new Counter<>(); 376 String canonicalFile = fileName; 377 firstMessage = "* " + canonicalFile; 378 file = canonicalFile; 379 380 DiffInfo diffInfo = new DiffInfo(); 381 382 if (levelMatcher != null || countOnly) { 383 try { 384 level = CoverageLevel2.getInstance(canonicalFile); 385 } catch (Exception e) { 386 } 387 } 388 389 if (countOnly) { 390 System.out.print(fileName); 391 for (Level cLevel : Level.values()) { 392 System.out.print("\t" + levelCounter.get(cLevel)); 393 } 394 System.out.println(); 395 } 396 397 Set<String> keys = new LinkedHashSet<>(source.keySet()); 398 if (other != null) { 399 keys.addAll(other.keySet()); 400 } 401 for (String path : keys) { 402 if (path.startsWith("//ldml/identity/")) { 403 continue; 404 } 405 if (pathMatcher != null && pathExclude == pathMatcher.reset(path).find()) { 406 continue; 407 } 408 409 Level pathLevel = null; 410 411 pathLevel = level == null ? Level.COMPREHENSIVE : level.getLevel(path); 412 levelCounter.add(pathLevel, 1); 413 414 if (levelMatcher != null 415 && levelExclude == levelMatcher.reset(pathLevel.toString()).find()) { 416 continue; 417 } 418 419 Set<String> values = source.get(path); 420 Set<String> otherValues = other == null ? null : other.get(path); 421 422 // if (showValues) { 423 // System.out.println("#"+values + "\t" + otherValues + "\t<=\t" + path); 424 // } 425 426 if (other != null) { 427 if (values != otherValues) { 428 boolean diff = true; 429 if (values == null) { 430 diffInfo.additionCount += otherValues.size(); 431 } else if (otherValues == null) { 432 diffInfo.deletionCount += values.size(); 433 } else if (!values.equals(otherValues)) { 434 diffInfo.changed2Values += values.size() + otherValues.size(); 435 } else { 436 diff = false; 437 diffInfo.sameCount += values.size(); 438 } 439 if (diff && showValues) { 440 show( 441 ConfigOption.add, 442 filePath, 443 file, 444 null, 445 null, 446 path, 447 values, 448 otherValues); 449 } 450 } 451 } else { 452 for (String value : values) { 453 if (replaceValues) { 454 String pattern = valuePattern; 455 for (int i = 0; i <= pathMatcher.groupCount(); ++i) { 456 pattern = pattern.replace("$" + i, pathMatcher.group(i)); 457 } 458 valueMatcher = PatternCache.get(pattern).matcher(""); 459 } 460 461 if (valueMatcher != null && valueExclude == valueMatcher.reset(value).find()) { 462 continue; 463 } 464 465 if (kountRegexMatches != null && pathMatcher != null) { 466 kountRegexMatches.add(pathMatcher.group(1), 1); 467 } 468 469 if (starCounter != null) { 470 starCounter.add(pathStarrer.set(path), 1); 471 } 472 ++total; 473 474 if (firstMessage != null) { 475 // System.out.println("#"+firstMessage); 476 firstMessage = null; 477 } 478 if (!countOnly) { 479 String data = 480 groups 481 ? group(value, valueMatcher) 482 + "\t" 483 + group(path, pathMatcher) 484 : value + "\t" + path; 485 if (!unique) { 486 String pathHeaderInfo = ""; 487 if (PATH_HEADER_FACTORY != null) { 488 PathHeader pathHeader = PATH_HEADER_FACTORY.fromPath(path); 489 if (pathHeader != null) { 490 pathHeaderInfo = 491 "\n\t" 492 + pathHeader 493 + "\n\t" 494 + pathHeader.getUrl( 495 BaseUrl.PRODUCTION, coreName); 496 } 497 } 498 // http://st.unicode.org/cldr-apps/v#/en/Fields/59d8178ec2fe04ae 499 if (!groups && pathHeaderInfo.isEmpty()) { 500 show( 501 ConfigOption.add, 502 filePath, 503 file, 504 null, 505 null, 506 path, 507 Collections.singleton(value), 508 null); 509 } else { 510 System.out.println( 511 "#?" 512 + (recursive ? filePath + "\t" : "") 513 + file 514 + "\t" 515 + data 516 + pathHeaderInfo); 517 } 518 } else { 519 uniqueData.add(data, 1); 520 } 521 } 522 } 523 } 524 } 525 if (other != null) { 526 ULocale locale = new ULocale(fileName.substring(0, fileName.length() - 4)); 527 String localeName = locale.getDisplayName(ULocale.ENGLISH); 528 String title = localeName + "\t" + fileName + "\t" + getType(locale); 529 diffInfo.showValues(title); 530 } 531 } 532 533 enum ConfigOption { 534 delete, 535 add, 536 addNew, 537 replace 538 } 539 show( ConfigOption configOption, String fileParent, String localeOrFile, String match_path, String match_value, String new_path, Set<String> new_values, Set<String> otherValues)540 public static void show( 541 ConfigOption configOption, 542 String fileParent, 543 String localeOrFile, 544 String match_path, 545 String match_value, 546 String new_path, 547 Set<String> new_values, 548 Set<String> otherValues) { 549 // locale= sv ; action=delete; value= YER ; path= 550 // //ldml/numbers/currencies/currency[@type="YER"]/symbol ; 551 552 // locale=en ; action=delete ; path=/.*short.*/ 553 554 // locale=en ; action=add ; 555 // new_path=//ldml/localeDisplayNames/territories/territory[@type="PS"][@alt="short"] ; 556 // new_value=Palestine 557 // locale= af ; action=add ; new_path= 558 // //ldml/dates/fields/field[@type="second"]/relative[@type="0"] ; new_value= nou 559 560 int extensionPos = localeOrFile.lastIndexOf('.'); 561 String fileWithoutSuffix = 562 extensionPos >= 0 ? localeOrFile.substring(0, extensionPos) : localeOrFile; 563 564 String values2 = 565 new_values == null 566 ? null 567 : new_values.size() != 1 568 ? new_values.toString() 569 : new_values.iterator().next(); 570 571 System.out.println( 572 fileParent 573 + ";\tlocale=" 574 + fileWithoutSuffix 575 + ";\taction=" 576 + configOption 577 + (match_value == null ? "" : ";\tvalue=" + escape(match_value)) 578 + (match_path == null ? "" : ";\tpath=" + match_path) 579 + (values2 == null ? "" : ";\tnew_value=" + escape(values2)) 580 + (new_path == null ? "" : ";\tnew_path=" + new_path) 581 + (otherValues == null ? "" : ";\tother_value=" + otherValues)); 582 } 583 584 static final Transliterator showInvisibles = 585 Transliterator.getInstance("[[:whitespace:][:cf:]-[\\u0020]]hex/perl"); 586 escape(String source)587 private static String escape(String source) { 588 return showInvisibles.transform(source); 589 } 590 591 static Set<String> defaultContent = 592 SupplementalDataInfo.getInstance().getDefaultContentLocales(); 593 getType(ULocale locale)594 private static String getType(ULocale locale) { 595 if (defaultContent.contains(locale.toString())) { 596 return "DC"; 597 } else if (locale.getCountry().isEmpty()) { 598 return "Base"; 599 } else { 600 return "Region"; 601 } 602 } 603 group(String item, Matcher matcher)604 private static String group(String item, Matcher matcher) { 605 if (matcher == null) { 606 return item; 607 } 608 StringBuilder b = new StringBuilder(); 609 for (int i = 1; i <= matcher.groupCount(); ++i) { 610 b.append(matcher.group(i)); 611 } 612 return b.toString(); 613 } 614 615 // static class StarCounter { 616 // Map<String,Counter<String>> data = new HashMap(); 617 // } 618 } 619