1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <core_jni_helpers.h>
18 #include <cutils/trace.h>
19 #include <fcntl.h>
20 #include <minikin/Hyphenator.h>
21 #ifdef __ANDROID__
22 #include <sys/mman.h>
23 #else
24 #include <android-base/mapped_file.h>
25 #include <android-base/properties.h>
26 #endif
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #ifdef __ANDROID__
30 #include <tracing_perfetto.h>
31 #endif
32 #include <unicode/uloc.h>
33 #include <unistd.h>
34
35 #include <algorithm>
36
37 namespace android {
38
buildFileName(const std::string & locale)39 static std::string buildFileName(const std::string& locale) {
40 #ifdef __ANDROID__
41 constexpr char SYSTEM_HYPHENATOR_PREFIX[] = "/system/usr/hyphen-data/hyph-";
42 #else
43 std::string hyphenPath = base::GetProperty("ro.hyphen.data.dir", "/system/usr/hyphen-data");
44 std::string SYSTEM_HYPHENATOR_PREFIX = hyphenPath + "/hyph-";
45 #endif
46 constexpr char SYSTEM_HYPHENATOR_SUFFIX[] = ".hyb";
47 std::string lowerLocale;
48 lowerLocale.reserve(locale.size());
49 std::transform(locale.begin(), locale.end(), std::back_inserter(lowerLocale), ::tolower);
50 return SYSTEM_HYPHENATOR_PREFIX + lowerLocale + SYSTEM_HYPHENATOR_SUFFIX;
51 }
52
mmapPatternFile(const std::string & locale)53 static std::pair<const uint8_t*, size_t> mmapPatternFile(const std::string& locale) {
54 const std::string hyFilePath = buildFileName(locale);
55 const int fd = open(hyFilePath.c_str(), O_RDONLY | O_CLOEXEC);
56 if (fd == -1) {
57 return std::make_pair(nullptr, 0); // Open failed.
58 }
59
60 struct stat st = {};
61 if (fstat(fd, &st) == -1) { // Unlikely to happen.
62 close(fd);
63 return std::make_pair(nullptr, 0);
64 }
65
66 #ifdef __ANDROID__
67 void* ptr = mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fd, 0 /* offset */);
68 close(fd);
69 if (ptr == MAP_FAILED) {
70 return std::make_pair(nullptr, 0);
71 }
72 #else
73 std::unique_ptr<base::MappedFile> patternFile =
74 base::MappedFile::FromFd(fd, 0, st.st_size, PROT_READ);
75 close(fd);
76 if (patternFile == nullptr) {
77 return std::make_pair(nullptr, 0);
78 }
79 auto* mappedPtr = new base::MappedFile(std::move(*patternFile));
80 char* ptr = mappedPtr->data();
81 #endif
82 return std::make_pair(reinterpret_cast<const uint8_t*>(ptr), st.st_size);
83 }
84
addHyphenatorWithoutPatternFile(const std::string & locale,int minPrefix,int minSuffix)85 static void addHyphenatorWithoutPatternFile(const std::string& locale, int minPrefix,
86 int minSuffix) {
87 minikin::addHyphenator(locale,
88 minikin::Hyphenator::loadBinary(nullptr, 0, minPrefix, minSuffix,
89 locale));
90 }
91
addHyphenator(const std::string & locale,int minPrefix,int minSuffix)92 static void addHyphenator(const std::string& locale, int minPrefix, int minSuffix) {
93 std::pair<const uint8_t*, size_t> r = mmapPatternFile(locale);
94 if (r.first == nullptr) {
95 ALOGE("Unable to find pattern file or unable to map it for %s", locale.c_str());
96 return;
97 }
98 minikin::addHyphenator(locale,
99 minikin::Hyphenator::loadBinary(r.first, r.second, minPrefix, minSuffix,
100 locale));
101 }
102
/*
 * Forwards to minikin::addHyphenatorAlias(from, to). Judging by its use in init(), this makes
 * locale |from| use the hyphenator registered for locale |to|; the exact lookup semantics live in
 * minikin and are not visible here.
 */
static void addHyphenatorAlias(const std::string& from, const std::string& to) {
    minikin::addHyphenatorAlias(from, to);
}
106
107 /*
108 * Cache the subtag key map by calling uloc_forLanguageTag with a subtag.
109 * minikin calls uloc_forLanguageTag with an Unicode extension specifying
110 * the line breaking strictness. Parsing the extension requires loading the key map
111 * from keyTypeData.res in the ICU.
112 * "lb" is the key commonly used by minikin. "ca" is a common legacy key mapping to
113 * the "calendar" key. It ensures that the key map is loaded and cached in icu4c.
114 * "en-Latn-US" is a common locale used in the Android system regardless what default locale
115 * is selected in the Settings app.
116 */
cacheUnicodeExtensionSubtagsKeyMap()117 inline static void cacheUnicodeExtensionSubtagsKeyMap() {
118 UErrorCode status = U_ZERO_ERROR;
119 char localeID[ULOC_FULLNAME_CAPACITY] = {};
120 uloc_forLanguageTag("en-Latn-US-u-lb-loose-ca-gregory", localeID, ULOC_FULLNAME_CAPACITY,
121 nullptr, &status);
122 }
123
init()124 static void init() {
125 // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but that
126 // appears too small.
127 constexpr int INDIC_MIN_PREFIX = 2;
128 constexpr int INDIC_MIN_SUFFIX = 2;
129
130 addHyphenator("af", 1, 1); // Afrikaans
131 addHyphenator("am", 1, 1); // Amharic
132 addHyphenator("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese
133 addHyphenator("be", 2, 2); // Belarusian
134 addHyphenator("bg", 2, 2); // Bulgarian
135 addHyphenator("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali
136 addHyphenator("cs", 2, 2); // Czech
137 addHyphenator("cu", 1, 2); // Church Slavonic
138 addHyphenator("cy", 2, 3); // Welsh
139 addHyphenator("da", 2, 2); // Danish
140 addHyphenator("de-1901", 2, 2); // German 1901 orthography
141 addHyphenator("de-1996", 2, 2); // German 1996 orthography
142 addHyphenator("de-CH-1901", 2, 2); // Swiss High German 1901 orthography
143 addHyphenator("el", 1, 1); // Greek
144 addHyphenator("en-GB", 2, 3); // British English
145 addHyphenator("en-US", 2, 3); // American English
146 addHyphenator("es", 2, 2); // Spanish
147 addHyphenator("et", 2, 3); // Estonian
148 addHyphenator("eu", 2, 2); // Basque
149 addHyphenator("fr", 2, 3); // French
150 addHyphenator("ga", 2, 3); // Irish
151 addHyphenator("gl", 2, 2); // Galician
152 addHyphenator("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati
153 addHyphenator("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi
154 addHyphenator("hr", 2, 2); // Croatian
155 addHyphenator("hu", 2, 2); // Hungarian
156 // texhyphen sources say Armenian may be (1, 2); but that it needs confirmation.
157 // Going with a more conservative value of (2, 2) for now.
158 addHyphenator("hy", 2, 2); // Armenian
159 addHyphenator("it", 2, 2); // Italian
160 addHyphenator("ka", 1, 2); // Georgian
161 addHyphenator("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada
162 addHyphenator("la", 2, 2); // Latin
163 addHyphenator("lt", 2, 2); // Lithuanian
164 addHyphenator("lv", 2, 2); // Latvian
165 addHyphenator("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam
166 addHyphenator("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script
167 addHyphenator("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi
168 addHyphenator("nb", 2, 2); // Norwegian Bokmål
169 addHyphenator("nl", 2, 2); // Dutch
170 addHyphenator("nn", 2, 2); // Norwegian Nynorsk
171 addHyphenator("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya
172 addHyphenator("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi
173 addHyphenator("pl", 2, 2); // Polish
174 addHyphenator("pt", 2, 3); // Portuguese
175 addHyphenator("ru", 2, 2); // Russian
176 addHyphenator("sk", 2, 2); // Slovak
177 addHyphenator("sl", 2, 2); // Slovenian
178 addHyphenator("sq", 2, 2); // Albanian
179 addHyphenator("sv", 1, 2); // Swedish
180 addHyphenator("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil
181 addHyphenator("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu
182 addHyphenator("tk", 2, 2); // Turkmen
183 addHyphenator("uk", 2, 2); // Ukrainian
184 addHyphenator("und-Ethi", 1, 1); // Any language in Ethiopic script
185
186 // Following two hyphenators do not have pattern files but there is some special logic based on
187 // language.
188 addHyphenatorWithoutPatternFile("ca", 2, 2); // Catalan
189
190 // English locales that fall back to en-US. The data is from CLDR. It's all English locales,
191 // minus the locales whose parent is en-001 (from supplementalData.xml, under <parentLocales>).
192 // TODO: Figure out how to get this from ICU.
193 addHyphenatorAlias("en-AS", "en-US"); // English (American Samoa)
194 addHyphenatorAlias("en-GU", "en-US"); // English (Guam)
195 addHyphenatorAlias("en-MH", "en-US"); // English (Marshall Islands)
196 addHyphenatorAlias("en-MP", "en-US"); // English (Northern Mariana Islands)
197 addHyphenatorAlias("en-PR", "en-US"); // English (Puerto Rico)
198 addHyphenatorAlias("en-UM", "en-US"); // English (United States Minor Outlying Islands)
199 addHyphenatorAlias("en-VI", "en-US"); // English (Virgin Islands)
200
201 // All English locales other than those falling back to en-US are mapped to en-GB.
202 addHyphenatorAlias("en", "en-GB");
203
204 // For German, we're assuming the 1996 (and later) orthography by default.
205 addHyphenatorAlias("de", "de-1996");
206 // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography.
207 addHyphenatorAlias("de-LI-1901", "de-CH-1901");
208
209 // Norwegian is very probably Norwegian Bokmål.
210 addHyphenatorAlias("no", "nb");
211
212 // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
213 addHyphenatorAlias("mn", "mn-Cyrl"); // Mongolian
214
215 // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
216 // Data is from CLDR's likelySubtags.xml.
217 // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
218 addHyphenatorAlias("am", "und-Ethi"); // Amharic
219 addHyphenatorAlias("byn", "und-Ethi"); // Blin
220 addHyphenatorAlias("gez", "und-Ethi"); // Geʻez
221 addHyphenatorAlias("ti", "und-Ethi"); // Tigrinya
222 addHyphenatorAlias("wal", "und-Ethi"); // Wolaytta
223
224 // Use Hindi as a fallback hyphenator for all languages written in Devanagari, etc. This makes
225 // sense because our Indic patterns are not really linguistic, but script-based.
226 addHyphenatorAlias("und-Beng", "bn"); // Bengali
227 addHyphenatorAlias("und-Deva", "hi"); // Devanagari -> Hindi
228 addHyphenatorAlias("und-Gujr", "gu"); // Gujarati
229 addHyphenatorAlias("und-Guru", "pa"); // Gurmukhi -> Punjabi
230 addHyphenatorAlias("und-Knda", "kn"); // Kannada
231 addHyphenatorAlias("und-Mlym", "ml"); // Malayalam
232 addHyphenatorAlias("und-Orya", "or"); // Oriya
233 addHyphenatorAlias("und-Taml", "ta"); // Tamil
234 addHyphenatorAlias("und-Telu", "te"); // Telugu
235
236 #ifdef __ANDROID__
237 tracing_perfetto::traceBegin(ATRACE_TAG_VIEW, "CacheUnicodeExtensionSubtagsKeyMap");
238 #endif
239 cacheUnicodeExtensionSubtagsKeyMap();
240 #ifdef __ANDROID__
241 tracing_perfetto::traceEnd(ATRACE_TAG_VIEW); // CacheUnicodeExtensionSubtagsKeyMap
242 #endif
243 }
244
245 static const JNINativeMethod gMethods[] = {
246 {"nInit", "()V", (void*) init},
247 };
248
/*
 * Registers the entries of gMethods as native methods of the Java class
 * "android/text/Hyphenator" using |env|, and returns the value produced by
 * RegisterMethodsOrDie.
 * NOTE(review): the helper's name suggests it aborts instead of returning on failure; confirm in
 * core_jni_helpers.h.
 */
int register_android_text_Hyphenator(JNIEnv* env) {
    return RegisterMethodsOrDie(env, "android/text/Hyphenator", gMethods, NELEM(gMethods));
}
252
253 } // namespace android
254