package org.unicode.cldr.tool;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;
import java.io.File;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.unicode.cldr.test.DisplayAndInputProcessor;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CLDRTransforms;
import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.TempPrintWriter;
import org.unicode.cldr.util.XMLSource;

/**
 * Transforms the contents of a CLDRFile.
 *
 * @author jchye
 */
public class CLDRFileTransformer {
    /** What to do with a path when the output locale already has a value for it. */
    public enum PolicyIfExisting {
        RETAIN, // Do not transliterate if existing output has locale content
        DISCARD, // Replace existing output locale content
        MINIMIZE // RETAIN, plus drop values if translit is a no-op.
    }

    /**
     * Contains all supported locale-to-locale conversions along with information needed to convert
     * each locale. Each enum value is named after the locale that results from the conversion.
     */
    public enum LocaleTransform {
        sr_Latn(
                "sr",
                "Serbian-Latin-BGN.xml",
                Transliterator.FORWARD,
                "[:script=Cyrl:]",
                PolicyIfExisting.DISCARD), //
        sr_Latn_BA(
                "sr_Cyrl_BA",
                "Serbian-Latin-BGN.xml",
                Transliterator.FORWARD,
                "[:script=Cyrl:]",
                PolicyIfExisting.DISCARD), //
        sr_Latn_ME(
                "sr_Cyrl_ME",
                "Serbian-Latin-BGN.xml",
                Transliterator.FORWARD,
                "[:script=Cyrl:]",
                PolicyIfExisting.DISCARD), //
        sr_Latn_XK(
                "sr_Cyrl_XK",
                "Serbian-Latin-BGN.xml",
                Transliterator.FORWARD,
                "[:script=Cyrl:]",
                PolicyIfExisting.DISCARD), //
        ha_NE(
                "ha",
                "ha-ha_NE.xml",
                Transliterator.FORWARD,
                "[y Y ƴ Ƴ ʼ]",
                PolicyIfExisting.DISCARD), //
        yo_BJ(
                "yo",
                "yo-yo_BJ.xml",
                Transliterator.FORWARD,
                "[ẹ ọ ṣ Ẹ Ọ Ṣ]",
                PolicyIfExisting.DISCARD), //
        de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), //
        yue_Hans(
                "yue",
                "Simplified-Traditional.xml",
                Transliterator.REVERSE,
                "[:script=Hant:]",
                PolicyIfExisting.RETAIN), //
    // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
    // Needs work to fix currency symbols, handle Māori. See
    // http://unicode.org/cldr/trac/ticket/9516#comment:6
    ;

        private final String inputLocale;
        private final String transformFilename;
        private final int direction;
        private final UnicodeSet inputChars;
        private final PolicyIfExisting policy;

        /**
         * Creates a transform that uses {@link PolicyIfExisting#DISCARD}.
         *
         * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)}
         *     instead
         */
        @Deprecated
        private LocaleTransform(
                String inputLocale,
                String transformFilename,
                int direction,
                String inputCharPattern) {
            this(
                    inputLocale,
                    transformFilename,
                    direction,
                    inputCharPattern,
                    PolicyIfExisting.DISCARD);
        }

        /**
         * @param inputLocale the locale whose data is read as input
         * @param transformFilename either the name of a transform XML file (contains ".xml") or a
         *     transliterator ID passed directly to {@link Transliterator#getInstance(String)}
         * @param direction {@link Transliterator#FORWARD} or {@link Transliterator#REVERSE}
         * @param inputCharPattern a {@link UnicodeSet} pattern describing the characters that
         *     should no longer be present after transformation
         * @param policy what to do when the output locale already has a value for a path
         */
        private LocaleTransform(
                String inputLocale,
                String transformFilename,
                int direction,
                String inputCharPattern,
                PolicyIfExisting policy) {
            this.inputLocale = inputLocale;
            this.transformFilename = transformFilename;
            this.direction = direction;
            this.inputChars = new UnicodeSet(inputCharPattern);
            this.policy = policy;
        }

        /**
         * @return the policy for existing content
         */
        public PolicyIfExisting getPolicyIfExisting() {
            return policy;
        }

        /**
         * @return the locale whose data is used as input for the conversion
         */
        public String getInputLocale() {
            return inputLocale;
        }

        /**
         * @return the locale that results from the conversion (the name of this enum value)
         */
        public String getOutputLocale() {
            return this.toString();
        }

        /**
         * @return the filename of the transform used to make the conversion
         */
        public String getTransformFilename() {
            return transformFilename;
        }

        /**
         * @return the direction of the transformation
         */
        public int getDirection() {
            return direction;
        }

        /**
         * @return the set of characters in the input locale that should have been removed after
         *     transformation, used for internal debugging
         */
        private UnicodeSet getInputChars() {
            return inputChars;
        }
    }

    /** Characters from a transform's input set that were still present after transformation. */
    private final UnicodeSet unconverted = new UnicodeSet();

    private final Factory factory;

    /*
     * The transliterators map exists, and is static, to avoid wasting a lot of time creating
     * a new Transliterator more often than necessary. (An alternative to "static" here might be to
     * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.)
     * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems.
     * Reference: https://unicode.org/cldr/trac/ticket/11657
     *
     * Entries are keyed only by LocaleTransform, so a transliterator built by one instance is
     * reused by every other instance regardless of that instance's transformDir.
     */
    private static final Map<LocaleTransform, Transliterator> transliterators =
            new ConcurrentHashMap<>();

    private final String transformDir;

    /**
     * @param factory the factory to get locale data from
     * @param transformDir the directory containing the transform files
     */
    public CLDRFileTransformer(Factory factory, String transformDir) {
        this.factory = factory;
        this.transformDir = transformDir;
    }

    /**
     * Returns the transliterator for the given transform, creating and caching it on first use.
     *
     * @param localeTransform the conversion whose transliterator is wanted
     * @return the cached or newly created transliterator
     */
    public Transliterator loadTransliterator(LocaleTransform localeTransform) {
        // computeIfAbsent makes the check-and-insert atomic, so concurrent callers neither race
        // between the lookup and the insert nor build the same transliterator twice.
        return transliterators.computeIfAbsent(localeTransform, this::createTransliterator);
    }

    /** Builds a new transliterator, from the XML rules file if one is named, else by ID. */
    private Transliterator createTransliterator(LocaleTransform localeTransform) {
        final String transformName = localeTransform.getTransformFilename();
        if (!transformName.contains(".xml")) {
            // Not a rules file: the name is itself a transliterator ID.
            return Transliterator.getInstance(transformName);
        }
        ParsedTransformID directionInfo = new ParsedTransformID();
        String ruleString =
                CLDRTransforms.getIcuRulesFromXmlFile(transformDir, transformName, directionInfo);
        return Transliterator.createFromRules(
                directionInfo.getId(), ruleString, localeTransform.getDirection());
    }

    /**
     * Builds the output locale's file by transforming every value of the input locale's file.
     *
     * <p>NOTE: This method does not currently handle nested transliterators.
     *
     * @param localeTransform the conversion to apply
     * @return null if the input file was missing, or if there is no new output file.
     */
    public CLDRFile transform(LocaleTransform localeTransform) {
        Transliterator transliterator = loadTransliterator(localeTransform);
        final String inputLocale = localeTransform.getInputLocale();
        final String outputLocale = localeTransform.getOutputLocale();

        CLDRFile input;
        try {
            input = factory.make(inputLocale, false);
        } catch (ICUUncheckedIOException ignored) {
            return null; // input file is missing (or otherwise unavailable)
        }

        boolean hadOutput = true;
        CLDRFile output;
        try {
            output = factory.make(outputLocale, false);
        } catch (NoSourceDirectoryException ignored) {
            // if we can't open the file, then just make a new one.
            XMLSource dataSource = new SimpleXMLSource(outputLocale);
            output = new CLDRFile(dataSource);
            hadOutput = false;
        }

        // NOTE(review): the resolved parent of the output locale is loaded and then immediately
        // replaced by the unresolved input locale, so the "parent" values below are read from
        // the input file. Both loads are preserved as-is; confirm which one is intended.
        String outputParentString = LocaleIDParser.getParent(outputLocale);
        CLDRFile outputParent = factory.make(outputParentString, true);
        outputParent = factory.make(inputLocale, false);

        XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
        DisplayAndInputProcessor daip = new DisplayAndInputProcessor(output, true);
        for (String xpath : input) {
            String value = input.getStringValue(xpath);
            if (value == null || isForeignInheritanceMarker(input, inputLocale, xpath, value)) {
                continue;
            }
            String fullPath = input.getFullXPath(xpath);
            String oldValue = output.getStringValue(xpath);
            String parentValue = outputParent.getStringValue(xpath);
            value =
                    transformValue(
                            transliterator, localeTransform, xpath, value, oldValue, parentValue);
            // Check again: the transformed value may itself be an inheritance marker (for
            // example an existing output value that was retained).
            if (value == null || isForeignInheritanceMarker(input, inputLocale, xpath, value)) {
                continue;
            }
            value = daip.processInput(xpath, value, null);
            outputSource.putValueAtPath(fullPath, value);
        }

        if (!hadOutput && !outputSource.iterator().hasNext()) {
            return null; // don't add file if nothing to add
        }
        return new CLDRFile(outputSource);
    }

    /**
     * Tells whether {@code value} is an inheritance marker that is not actually present in the
     * input locale's own file; such markers came from somewhere else and are ignored.
     */
    private static boolean isForeignInheritanceMarker(
            CLDRFile input, String inputLocale, String xpath, String value) {
        if (!CldrUtility.INHERITANCE_MARKER.equals(value)) {
            return false;
        }
        final String foundIn = input.getSourceLocaleID(xpath, null);
        // Include these only when they are actually present in this file
        return !foundIn.equals(inputLocale);
    }

    /**
     * Transforms a CLDRFile value into another form.
     *
     * @param transliterator the transliterator to apply
     * @param localeTransform the conversion being performed; supplies the policy and the set of
     *     characters expected to disappear
     * @param path the path of the value; exemplar character paths are not transliterated
     * @param value the value from the input locale
     * @param oldValue the value the output locale already has for this path, or null
     * @param parentValue currently unused
     * @return the value to store in the output; may be null under {@link
     *     PolicyIfExisting#MINIMIZE} when the transformation leaves the value unchanged
     */
    private String transformValue(
            Transliterator transliterator,
            LocaleTransform localeTransform,
            String path,
            String value,
            String oldValue,
            String parentValue) {

        final PolicyIfExisting policy = localeTransform.getPolicyIfExisting();

        // allows us to change only new values
        final boolean keepsExisting =
                policy == PolicyIfExisting.RETAIN || policy == PolicyIfExisting.MINIMIZE;
        if (keepsExisting && oldValue != null) {
            return oldValue;
        }

        String transliterated;

        // TODO: Don't transform dates/patterns.
        // For now, don't try to transliterate the exemplar characters - use the ones from the
        // original locale.
        // In the future, we can probably control this better with a config file - similar to
        // CLDRModify's config file.
        if (path.contains("exemplarCharacters")) {
            transliterated = (oldValue != null) ? oldValue : value;
        } else {
            transliterated = transliterator.transliterate(value);
            transliterated = Normalizer.compose(transliterated, false);
        }

        if (policy == PolicyIfExisting.MINIMIZE && transliterated.equals(value)) {
            return null; // the transformation was a no-op, so drop the value
        }

        // Remember any characters that should have been converted but are still present.
        UnicodeSet chars = localeTransform.getInputChars();
        if (chars.containsSome(transliterated)) {
            unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
        }
        return transliterated;
    }

    /**
     * Regenerates the output locale file of every {@link LocaleTransform} in each LDML directory
     * under {@link CLDRPaths#COMMON_DIRECTORY}, except the directories skipped below.
     */
    public static void main(String[] args) throws Exception {
        for (String dir : DtdType.ldml.directories) {
            if (dir.equals("casing") // skip, field contents are keywords, not localizable content
                    || dir.equals(
                            "collation") // skip, field contents are complex, and can't be simply
                    // remapped
                    || dir.equals("annotationsDerived") // skip, derived later
            ) {
                continue;
            }
            System.out.println("\nDirectory: " + dir);
            final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/";
            Factory factory = Factory.make(sourceDirectory, ".*");

            CLDRFileTransformer transformer =
                    new CLDRFileTransformer(
                            factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
            for (LocaleTransform localeTransform : LocaleTransform.values()) {
                CLDRFile output = transformer.transform(localeTransform);
                if (output == null) {
                    // NOTE(review): transform() also returns null when there is nothing new to
                    // write, so this message can be printed even though the input file exists.
                    System.out.println(
                            "SKIPPING missing file: "
                                    + dir
                                    + "/"
                                    + localeTransform.getInputLocale()
                                    + ".xml");
                    continue;
                }
                String outputFile = output.getLocaleID() + ".xml";
                try (TempPrintWriter out =
                        TempPrintWriter.openUTF8Writer(sourceDirectory, outputFile)
                                .skipCopyright(true)) {
                    if (!transformer.unconverted.isEmpty()) {
                        System.out.println("Untransformed characters: " + transformer.unconverted);
                        transformer.unconverted.clear();
                    }
                    output.write(out.asPrintWriter());
                }
            }
        }
    }
}