xref: /aosp_15_r20/external/cldr/tools/cldr-code/src/main/java/org/unicode/cldr/util/TransliteratorUtilities.java (revision 912701f9769bb47905792267661f0baf2b85bed5)
1 /*
2  *******************************************************************************
3  * Copyright (C) 2002-2016, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package org.unicode.cldr.util;
8 
9 import com.ibm.icu.text.Transliterator;
10 import com.ibm.icu.util.ICUUncheckedIOException;
11 import java.io.BufferedReader;
12 import java.io.IOException;
13 import org.unicode.cldr.draft.FileUtilities;
14 
15 public class TransliteratorUtilities {
16     public static boolean DEBUG = false;
17 
registerTransliteratorFromFile(String dir, String id)18     public static void registerTransliteratorFromFile(String dir, String id) {
19         try {
20             String filename = id.replace('-', '_') + ".txt";
21             String rules = getFileContents(dir, filename);
22             Transliterator t;
23             int pos = id.indexOf('-');
24             String rid;
25             if (pos < 0) {
26                 rid = id + "-Any";
27                 id = "Any-" + id;
28             } else {
29                 rid = id.substring(pos + 1) + "-" + id.substring(0, pos);
30             }
31             t = Transliterator.createFromRules(id, rules, Transliterator.FORWARD);
32             Transliterator.unregister(id);
33             Transliterator.registerInstance(t);
34 
35             /*String test = "\u049A\u0430\u0437\u0430\u049B";
36             System.out.println(t.transliterate(test));
37             t = Transliterator.getInstance(id);
38             System.out.println(t.transliterate(test));
39             */
40 
41             t = Transliterator.createFromRules(rid, rules, Transliterator.REVERSE);
42             Transliterator.unregister(rid);
43             Transliterator.registerInstance(t);
44             if (DEBUG) System.out.println("Registered new Transliterator: " + id + ", " + rid);
45         } catch (IOException e) {
46             // #if defined(FOUNDATION10) || defined(J2SE13)
47             // ##        throw (IllegalArgumentException) new IllegalArgumentException("Can't open "
48             // + dir + ", " + id+" "+ e.getMessage());
49             // #else
50             throw new ICUUncheckedIOException("Can't open " + dir + ", " + id, e);
51             // #endif
52         }
53     }
54 
55     /** */
getFileContents(String dir, String filename)56     public static String getFileContents(String dir, String filename) throws IOException {
57         // #if defined(FOUNDATION10) || defined(J2SE13)
58         // ##        BufferedReader br = TestUtil.openUTF8Reader(dir, filename);
59         // #else
60         BufferedReader br = FileUtilities.openUTF8Reader(dir, filename);
61         // #endif
62         StringBuffer buffer = new StringBuffer();
63         while (true) {
64             String line = br.readLine();
65             if (line == null) break;
66             if (line.length() > 0 && line.charAt(0) == '\uFEFF') line = line.substring(1);
67             buffer.append(line).append("\r\n");
68         }
69         br.close();
70         return buffer.toString();
71     }
72 
73     private static final String BASE_RULES =
74             ":: (hex-any/xml);"
75                     + ":: (hex-any/xml10);"
76                     + "'<' > '&lt;' ;"
77                     + "'<' < '&'[lL][Tt]';' ;"
78                     + "'&' > '&amp;' ;"
79                     + "'&' < '&'[aA][mM][pP]';' ;"
80                     + "'>' < '&'[gG][tT]';' ;"
81                     + "'\"' < '&'[qQ][uU][oO][tT]';' ; "
82                     + "'' < '&'[aA][pP][oO][sS]';' ; ";
83 
84     private static final String CONTENT_RULES = "'>' > '&gt;' ;";
85 
86     private static final String HTML_RULES = BASE_RULES + CONTENT_RULES + "'\"' > '&quot;' ; ";
87 
88     private static final String HTML_RULES_CONTROLS =
89             HTML_RULES
90                     + ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; ";
91 
92     private static final String HTML_RULES_ASCII =
93             HTML_RULES + ":: [[:C:][:^ASCII:]] any-hex/xml ; ";
94 
95     private static final String XML_RULES = HTML_RULES + "'' > '&apos;' ; ";
96 
97     /*
98     The ampersand character (&) and the left angle bracket (<) MUST NOT appear
99 
100     in their literal form, except when used as markup delimiters, or within a
101 
102     comment, a processing instruction, or a CDATA section. If they are needed
103 
104     elsewhere, they MUST be escaped using either numeric character references or
105 
106     the strings "&amp;" and "&lt;" respectively. The right angle bracket (>) MAY
107 
108     be represented using the string "&gt;", and MUST, for compatibility, be
109 
110     escaped using either "&gt;" or a character reference when it appears in the string
111 
112     "]]>" in content, when that string is not marking the end of a CDATA section.
113 
114     In the content of elements, character data is any string of characters which does
115 
116     not contain the start-delimiter of any markup and does not include the
117 
118     CDATA-section-close delimiter, "]]>". In a CDATA section, character data is
119 
120     any string of characters not including the CDATA-section-close delimiter,
121 
122     "]]>".
123 
124     To allow attribute values to contain both single and double quotes, the
125 
126     apostrophe or single-quote character (') MAY be represented as "&apos;", and
127 
128     the double-quote character (") as "&quot;".
129 
130 
131      */
132 
133     public static final Transliterator toXML =
134             Transliterator.createFromRules("any-xml", XML_RULES, Transliterator.FORWARD);
135     public static final Transliterator fromXML =
136             Transliterator.createFromRules("xml-any", XML_RULES, Transliterator.REVERSE);
137     public static final Transliterator toHTML =
138             Transliterator.createFromRules("any-html", HTML_RULES, Transliterator.FORWARD);
139     public static final Transliterator toHTMLControl =
140             Transliterator.createFromRules("any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);
141     public static final Transliterator toHTMLAscii =
142             Transliterator.createFromRules("any-html", HTML_RULES_ASCII, Transliterator.FORWARD);
143     public static final Transliterator fromHTML =
144             Transliterator.createFromRules("html-any", HTML_RULES, Transliterator.REVERSE);
145 
getTransliteratorFromFile(String ID, String file)146     public static Transliterator getTransliteratorFromFile(String ID, String file) {
147         return getTransliteratorFromFile(ID, file, Transliterator.FORWARD);
148     }
149 
getTransliteratorFromFile(String ID, String file, int direction)150     public static Transliterator getTransliteratorFromFile(String ID, String file, int direction) {
151         try {
152             BufferedReader br = CldrUtility.getUTF8Data(file);
153             StringBuilder input = new StringBuilder();
154             while (true) {
155                 String line = br.readLine();
156                 if (line == null) break;
157                 if (line.startsWith("\uFEFF")) line = line.substring(1); // remove BOM
158                 input.append(line);
159                 input.append('\n');
160             }
161             return Transliterator.createFromRules(ID, input.toString(), direction);
162         } catch (IOException e) {
163             throw new ICUUncheckedIOException("Can't open transliterator file " + file, e);
164         }
165     }
166 }
167