/*
 *******************************************************************************
 * Copyright (C) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package org.unicode.cldr.util;

import com.ibm.icu.text.Transliterator;
import com.ibm.icu.util.ICUUncheckedIOException;
import java.io.BufferedReader;
import java.io.IOException;
import org.unicode.cldr.draft.FileUtilities;

/**
 * Helpers for building ICU {@link Transliterator}s from rule files, plus ready-made
 * transliterators that escape and unescape XML/HTML markup characters.
 */
public class TransliteratorUtilities {
    /** When true, {@link #registerTransliteratorFromFile} prints the IDs it registers. */
    public static boolean DEBUG = false;

    /**
     * Reads a rule file and registers both the forward and the reverse transliterator built
     * from it, replacing any transliterators already registered under the same IDs.
     *
     * <p>The file name is {@code id} with every '-' replaced by '_', followed by ".txt". If
     * {@code id} contains no hyphen, the forward ID is "Any-" + id and the reverse ID is id +
     * "-Any"; otherwise the reverse ID swaps the parts before and after the first hyphen.
     *
     * @param dir directory passed to {@link #getFileContents(String, String)}
     * @param id transliterator ID, e.g. "Source-Target" or just "Target"
     * @throws ICUUncheckedIOException if the rule file cannot be read
     */
    public static void registerTransliteratorFromFile(String dir, String id) {
        final String rules;
        try {
            rules = getFileContents(dir, id.replace('-', '_') + ".txt");
        } catch (IOException e) {
            throw new ICUUncheckedIOException("Can't open " + dir + ", " + id, e);
        }

        final int hyphen = id.indexOf('-');
        final String forwardId;
        final String reverseId;
        if (hyphen < 0) {
            forwardId = "Any-" + id;
            reverseId = id + "-Any";
        } else {
            forwardId = id;
            reverseId = id.substring(hyphen + 1) + "-" + id.substring(0, hyphen);
        }

        // Build each instance before unregistering the old one, so that a failure while
        // parsing the rules leaves the existing registration untouched.
        Transliterator forward =
                Transliterator.createFromRules(forwardId, rules, Transliterator.FORWARD);
        Transliterator.unregister(forwardId);
        Transliterator.registerInstance(forward);

        Transliterator reverse =
                Transliterator.createFromRules(reverseId, rules, Transliterator.REVERSE);
        Transliterator.unregister(reverseId);
        Transliterator.registerInstance(reverse);

        if (DEBUG) {
            System.out.println("Registered new Transliterator: " + forwardId + ", " + reverseId);
        }
    }

    /**
     * Reads a whole file through {@code FileUtilities.openUTF8Reader(dir, filename)}.
     *
     * <p>A U+FEFF (byte order mark) at the start of any line is dropped, and every line,
     * including the last, is terminated with "\r\n" in the result.
     *
     * @param dir directory handed to {@code FileUtilities.openUTF8Reader}
     * @param filename file name handed to {@code FileUtilities.openUTF8Reader}
     * @return the file contents with normalized line terminators
     * @throws IOException if the file cannot be opened, read or closed
     */
    public static String getFileContents(String dir, String filename) throws IOException {
        // try-with-resources closes the reader even when readLine() fails part-way through.
        try (BufferedReader br = FileUtilities.openUTF8Reader(dir, filename)) {
            return readLines(br, "\r\n");
        }
    }

    /**
     * Drains {@code reader} line by line, removing a leading U+FEFF from each line and appending
     * {@code lineTerminator} after each one. The caller remains responsible for closing the
     * reader.
     */
    private static String readLines(BufferedReader reader, String lineTerminator)
            throws IOException {
        StringBuilder contents = new StringBuilder();
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            if (line.startsWith("\uFEFF")) {
                line = line.substring(1); // remove BOM
            }
            contents.append(line).append(lineTerminator);
        }
        return contents.toString();
    }

    /**
     * Rules shared by the XML and HTML escapers. Forward: escape '<' and '&'. Reverse: decode
     * the named entities lt, amp, gt, quot and apos (letters matched case-insensitively); the
     * two leading "::" lines additionally run ICU's hex-any/xml and hex-any/xml10 transforms in
     * the reverse direction only (numeric character references, per the ICU documentation).
     */
    private static final String BASE_RULES =
            ":: (hex-any/xml);"
                    + ":: (hex-any/xml10);"
                    + "'<' > '&lt;' ;"
                    + "'<' < '&'[lL][Tt]';' ;"
                    + "'&' > '&amp;' ;"
                    + "'&' < '&'[aA][mM][pP]';' ;"
                    + "'>' < '&'[gG][tT]';' ;"
                    + "'\"' < '&'[qQ][uU][oO][tT]';' ; "
                    + "'' < '&'[aA][pP][oO][sS]';' ; ";

    /** Forward-only rule: escape '>'. */
    private static final String CONTENT_RULES = "'>' > '&gt;' ;";

    /** Base and content rules plus forward escaping of the double quote. */
    private static final String HTML_RULES = BASE_RULES + CONTENT_RULES + "'\"' > '&quot;' ; ";

    /** HTML rules followed by a filtered "hex/unicode" pass over control/space/ignorable chars. */
    private static final String HTML_RULES_CONTROLS =
            HTML_RULES
                    + ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; ";

    /** HTML rules followed by an any-hex/xml pass over control and non-ASCII characters. */
    private static final String HTML_RULES_ASCII =
            HTML_RULES + ":: [[:C:][:^ASCII:]] any-hex/xml ; ";

    /** HTML rules plus forward escaping of the apostrophe ('' is a literal quote in rules). */
    private static final String XML_RULES = HTML_RULES + "'' > '&apos;' ; ";

    /*
     * From the XML specification, "Character Data and Markup":
     *
     * The ampersand character (&) and the left angle bracket (<) MUST NOT appear
     * in their literal form, except when used as markup delimiters, or within a
     * comment, a processing instruction, or a CDATA section. If they are needed
     * elsewhere, they MUST be escaped using either numeric character references or
     * the strings "&amp;" and "&lt;" respectively. The right angle bracket (>) MAY
     * be represented using the string "&gt;", and MUST, for compatibility, be
     * escaped using either "&gt;" or a character reference when it appears in the string
     * "]]>" in content, when that string is not marking the end of a CDATA section.
     *
     * In the content of elements, character data is any string of characters which does
     * not contain the start-delimiter of any markup and does not include the
     * CDATA-section-close delimiter, "]]>". In a CDATA section, character data is
     * any string of characters not including the CDATA-section-close delimiter,
     * "]]>".
     *
     * To allow attribute values to contain both single and double quotes, the
     * apostrophe or single-quote character (') MAY be represented as "&apos;", and
     * the double-quote character (") as "&quot;".
     */

    /** Escapes {@code < & > " '} as XML entities. */
    public static final Transliterator toXML =
            Transliterator.createFromRules("any-xml", XML_RULES, Transliterator.FORWARD);

    /** Reverse direction of the XML rules: decodes entities back to characters. */
    public static final Transliterator fromXML =
            Transliterator.createFromRules("xml-any", XML_RULES, Transliterator.REVERSE);

    /** Escapes {@code < & > "} as HTML entities. */
    public static final Transliterator toHTML =
            Transliterator.createFromRules("any-html", HTML_RULES, Transliterator.FORWARD);

    /**
     * Like {@link #toHTML}, then applies {@link #HTML_RULES_CONTROLS}' extra pass. Note that it
     * is created with the same ID, "any-html", as {@link #toHTML}.
     */
    public static final Transliterator toHTMLControl =
            Transliterator.createFromRules("any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);

    /**
     * Like {@link #toHTML}, then applies {@link #HTML_RULES_ASCII}'s extra pass. Note that it is
     * created with the same ID, "any-html", as {@link #toHTML}.
     */
    public static final Transliterator toHTMLAscii =
            Transliterator.createFromRules("any-html", HTML_RULES_ASCII, Transliterator.FORWARD);

    /** Reverse direction of the HTML rules: decodes entities back to characters. */
    public static final Transliterator fromHTML =
            Transliterator.createFromRules("html-any", HTML_RULES, Transliterator.REVERSE);

    /**
     * Builds a forward transliterator from a rule file.
     *
     * @param ID ID given to the new transliterator
     * @param file name passed to {@code CldrUtility.getUTF8Data}
     * @return the transliterator built from the file's rules
     * @throws ICUUncheckedIOException if the file cannot be read
     */
    public static Transliterator getTransliteratorFromFile(String ID, String file) {
        return getTransliteratorFromFile(ID, file, Transliterator.FORWARD);
    }

    /**
     * Builds a transliterator from a rule file opened with {@code CldrUtility.getUTF8Data(file)}.
     * A U+FEFF at the start of any line is dropped and lines are joined with '\n'. The result is
     * returned to the caller only; it is not registered with ICU.
     *
     * @param ID ID given to the new transliterator
     * @param file name passed to {@code CldrUtility.getUTF8Data}
     * @param direction {@link Transliterator#FORWARD} or {@link Transliterator#REVERSE}
     * @return the transliterator built from the file's rules
     * @throws ICUUncheckedIOException if the file cannot be read
     */
    public static Transliterator getTransliteratorFromFile(String ID, String file, int direction) {
        final String rules;
        // The reader used to be left open; try-with-resources now closes it on every path.
        try (BufferedReader br = CldrUtility.getUTF8Data(file)) {
            rules = readLines(br, "\n");
        } catch (IOException e) {
            throw new ICUUncheckedIOException("Can't open transliterator file " + file, e);
        }
        return Transliterator.createFromRules(ID, rules, direction);
    }
}